HttpClient代理访问网站并下载一千张美女图

HttpClient代理访问网站并下载一千张美女图

HttpClient代理访问网站并爬虫下载一千张图片

proxy类

package proxy;
import com.alibaba.fastjson.JSONObject;
import proxy.HttpClientUtils;
import proxy.SougouImgPipeline;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
 
/**
 * Crawler entry point: pages through a picture-search endpoint, collects the
 * {@code picUrl} of every returned item and hands the collected URLs to
 * {@link SougouImgPipeline} for download.
 *
 * @author liyangwei
 * @since 0.1.0
 */
@SuppressWarnings("unused")
public class proxy {

    /** URL template with three {@code %s} slots: start index, page size, query word. */
    private String url;
    private SougouImgPipeline pipeline;
    /** Every item object returned by the search endpoint so far. */
    private List<JSONObject> dataList;
    /** The {@code picUrl} of every item in {@link #dataList}, in the same order. */
    private List<String> urlList;
    /** Raw (not URL-encoded) search word; also used as the download category. */
    private String word;

    /**
     * @param url  request URL template, see {@link #url}
     * @param word raw search word; it is URL-encoded when the request is built
     */
    public proxy(String url, String word) {
        this.url = url;
        this.word = word;
        this.pipeline = new SougouImgPipeline();
        this.dataList = new ArrayList<>();
        this.urlList = new ArrayList<>();
    }

    /**
     * Fetches one page of search results and appends its items and picture URLs
     * to the internal lists. A page whose request failed, or whose response has
     * no {@code data.items} array, is skipped instead of aborting the crawl.
     *
     * @param idx  index of the first result to request
     * @param size number of results to request
     */
    public void process(int idx, int size) {
        String res = HttpClientUtils.get(String.format(this.url, idx, size, encodeQuery(this.word)));
        // HttpClientUtils.get returns "" when the request fails.
        JSONObject object = (res == null || res.isEmpty()) ? null : JSONObject.parseObject(res);
        JSONObject data = object == null ? null : object.getJSONObject("data");
        List<Object> items = data == null ? null : data.getJSONArray("items");
        if (items == null) {
            System.err.println("从第" + idx + "条开始的搜索结果为空或请求失败,已跳过");
            return;
        }
        for (Object element : items) {
            if (!(element instanceof JSONObject)) {
                continue;
            }
            JSONObject item = (JSONObject) element;
            // A missing picUrl is stored as null; the pipeline skips null entries.
            this.urlList.add(item.getString("picUrl"));
            this.dataList.add(item);
        }
    }

    /** Downloads every collected picture URL using the multi-threaded pipeline. */
    public void pipelineData() {
        pipeline.processSync(this.urlList, this.word);
    }

    /** URL-encodes a query parameter value as UTF-8. */
    private static String encodeQuery(String value) {
        try {
            return URLEncoder.encode(String.valueOf(value), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // UTF-8 support is mandatory on every Java platform.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }

    public static void main(String[] args) {
        String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
        proxy processor = new proxy(url, "美女");
        // start index, results per request, total number of results to crawl
        int start = 0, size = 50, limit = 1000;
        for (int i = start; i < start + limit; i += size) {
            processor.process(i, size);
        }
        processor.pipelineData();
    }

}

ProxyConfig类

此类为代理ip和端口以及代理服务器账号密码配置类

package proxy;

import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.impl.client.BasicCredentialsProvider;

/**
 * Holds the HTTP proxy endpoint and the credentials used to authenticate
 * against it.
 *
 * <p>NOTE(review): the default endpoint and credentials are compiled into the
 * class. Prefer the parameterized constructor with values loaded from
 * configuration, and avoid committing real credentials to source control.
 */
public class ProxyConfig {
    private static final String DEFAULT_HOST = "10.36.6.66";
    private static final int DEFAULT_PORT = 3128;
    private static final String DEFAULT_USERNAME = "F1232170";
    private static final String DEFAULT_PASSWORD = "44DRunBA";

    final HttpHost proxy;
    final CredentialsProvider provider;

    /** Creates a configuration for the built-in default proxy and account. */
    ProxyConfig() {
        this(DEFAULT_HOST, DEFAULT_PORT, DEFAULT_USERNAME, DEFAULT_PASSWORD);
    }

    /**
     * Creates a configuration for an arbitrary proxy.
     *
     * @param host     proxy host name or IP address
     * @param port     proxy port
     * @param username proxy account name
     * @param password proxy account password
     */
    ProxyConfig(String host, int port, String username, String password) {
        proxy = new HttpHost(host, port);
        provider = new BasicCredentialsProvider();
        // Scope the credentials to the proxy host only.
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(username, password));
    }

    /** @return the proxy endpoint */
    public HttpHost get_proxy() {
        return this.proxy;
    }

    /** @return the credentials provider holding the proxy account */
    public CredentialsProvider get_provider() {
        return this.provider;
    }
}

HttpClientUtils类

此类为HttpClient访问网站类

package proxy;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;

/**
 * Static helpers that issue HTTP requests with Apache HttpClient, either
 * through the proxy described by {@link ProxyConfig} or directly.
 *
 * <p>Every call builds a new client and closes it when the request is done.
 */
public abstract class HttpClientUtils {

    public static Logger logger = LoggerFactory.getLogger(HttpClientUtils.class);

    /**
     * Performs a GET request through the configured proxy and returns the
     * response body decoded as UTF-8.
     *
     * @param url absolute request URL
     * @return the response body, or {@code ""} if the request failed or the
     *         response carried no entity
     */
    public static String get(String url) {
        String charset = "UTF-8";
        HttpGet httpGet = new HttpGet(url);
        // Use executeRequest(httpGet, charset) instead for a direct, proxy-less request.
        return executeRequest_proxy(httpGet, charset);
    }

    /**
     * Executes {@code httpGet} through the proxy from {@link ProxyConfig},
     * authenticating with the proxy account.
     *
     * <p>The client is built once, with the proxy credentials and HttpClient's
     * default TLS settings.
     *
     * @param httpGet request to execute; its request config is replaced so
     *                that it is routed through the proxy
     * @param charset charset used to decode the body when the response does
     *                not declare one
     * @return the response body, or {@code ""} on I/O failure or empty response
     */
    public static String executeRequest_proxy(HttpGet httpGet, String charset) {
        ProxyConfig proxyconfig = new ProxyConfig();
        HttpHost proxy = proxyconfig.get_proxy();
        CredentialsProvider provider = proxyconfig.get_provider();

        // Route this request through the proxy host and port.
        RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
        httpGet.setConfig(config);

        // Exactly one client is created, so nothing is left unclosed.
        CloseableHttpClient httpclient = HttpClients.custom().setDefaultCredentialsProvider(provider).build();
        return execute(httpclient, httpGet, charset, "开始HttpClient请求", "HttpClient请求完成");
    }

    /**
     * Executes a request without a proxy. HTTPS requests use the
     * trust-everything client from {@link #createSSLInsecureClient()}; all
     * other schemes use the default client.
     *
     * @param httpRequest request to execute
     * @param charset     charset used to decode the body when the response
     *                    does not declare one
     * @return the response body, or {@code ""} on I/O failure or empty response
     */
    public static String executeRequest(HttpUriRequest httpRequest, String charset) {
        CloseableHttpClient httpclient;
        if ("https".equals(httpRequest.getURI().getScheme())) {
            httpclient = createSSLInsecureClient();
        } else {
            httpclient = HttpClients.createDefault();
        }
        return execute(httpclient, httpRequest, charset, "无代理HttpClient请求开始", "无代理HttpClient请求完成");
    }

    /**
     * Runs {@code request} on {@code client}, reads the body and closes both
     * the response and the client, in that order.
     */
    private static String execute(CloseableHttpClient client, HttpUriRequest request, String charset,
            String startMessage, String doneMessage) {
        String result = "";
        try (CloseableHttpClient httpclient = client) {
            System.out.println(startMessage);
            try (CloseableHttpResponse response = httpclient.execute(request)) {
                System.out.println(doneMessage);
                HttpEntity entity = response.getEntity();
                // Some responses (e.g. 204) carry no entity; report an empty body.
                if (entity != null) {
                    try {
                        result = EntityUtils.toString(entity, charset);
                    } finally {
                        // Releases whatever the entity still holds.
                        EntityUtils.consume(entity);
                    }
                }
            }
        } catch (IOException ex) {
            logger.error("HTTP request to {} failed", request.getURI(), ex);
        }
        return result;
    }

    /**
     * Builds a client that accepts ANY server certificate and ANY host name.
     *
     * <p>WARNING: this disables TLS server authentication and must only be
     * used where that is acceptable.
     *
     * @return a new client; the caller is responsible for closing it
     * @throws RuntimeException if the SSL context cannot be created
     */
    public static CloseableHttpClient createSSLInsecureClient() {
        try {
            System.out.println("SSLContext");
            // Trust every certificate chain, including self-signed ones.
            SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(new TrustStrategy() {
                @Override
                public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    return true;
                }
            }).build();
            System.out.println("SSLConnectionSocketFactory");
            // Accept every host name; no protocol restriction is configured here.
            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, new HostnameVerifier() {
                @Override
                public boolean verify(String hostname, SSLSession session) {
                    return true;
                }
            });
            return HttpClients.custom().setSSLSocketFactory(sslsf).build();
        } catch (GeneralSecurityException ex) {
            throw new RuntimeException(ex);
        }
    }

}



SougouImgPipeline类

此类为美女图片下载类

package proxy;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
 
/**
 * Downloads pictures from a list of URLs into {@code <path>/<category>/}.
 *
 * <p>Success and failure counts are accumulated per instance in atomic
 * counters, so one instance may be driven from several worker threads.
 *
 * @author liyangei
 * @since 0.1.0
 */
public class SougouImgPipeline {

    private static final String DEFAULT_PATH = "/Users/holmes/eclipse-workspace/proxy/data/picture/";
    private static final String DEFAULT_EXTENSION = ".jpg";
    /** Longest suffix (dot included) still treated as a file extension. */
    private static final int MAX_EXTENSION_LENGTH = 6;

    /** Extension used when the URL itself does not end in a usable one. */
    private String extension = DEFAULT_EXTENSION;
    private String path;

    private final AtomicInteger suc = new AtomicInteger();
    private final AtomicInteger fails = new AtomicInteger();

    /** Creates a pipeline that stores pictures under the built-in default directory. */
    public SougouImgPipeline() {
        this(DEFAULT_PATH);
    }

    /**
     * @param path root directory; pictures go into a sub-directory per category
     */
    public SougouImgPipeline(String path) {
        this(path, DEFAULT_EXTENSION);
    }

    /**
     * @param path      root directory; pictures go into a sub-directory per category
     * @param extension fallback extension (with leading dot) for URLs without one
     */
    public SougouImgPipeline(String path, String extension) {
        this.path = path;
        this.extension = extension == null ? DEFAULT_EXTENSION : extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads one picture to {@code <path>/<cate>/<name><ext>}. A file that
     * already exists is left alone. If the transfer fails, the partial file is
     * removed so that a later run retries it instead of skipping it.
     *
     * @param url  picture URL
     * @param cate category, used as the sub-directory name
     * @param name file name without extension
     * @throws Exception if the directory cannot be created or the transfer fails
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        System.out.println("下载");
        File dir = new File(this.path + "/" + cate + "/");
        // mkdirs() returns false when another worker created the directory first,
        // so re-check before treating it as an error.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IllegalStateException("无法创建目录: " + dir);
        }
        String fileName = (name + resolveExtension(url)).replace("-", "");
        File img = new File(dir, fileName);
        if (img.exists()) {
            System.out.println(String.format("文件%s已存在本地目录", fileName));
            return;
        }
        URLConnection con = new URL(url).openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);
        try (InputStream inputStream = con.getInputStream();
             FileOutputStream os = new FileOutputStream(img)) {
            byte[] bs = new byte[1024];
            int len;
            while ((len = inputStream.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        } catch (Exception e) {
            // Both streams are already closed here; drop the incomplete file.
            img.delete();
            throw e;
        }
        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下载第%s张图片", suc.incrementAndGet()));
    }

    /**
     * Returns the extension (with leading dot) of the last path segment of
     * {@code url}, ignoring any query string or fragment. Falls back to the
     * configured default when the segment has no short alphanumeric extension.
     */
    private String resolveExtension(String url) {
        String tail = url;
        int query = tail.indexOf('?');
        if (query >= 0) {
            tail = tail.substring(0, query);
        }
        int fragment = tail.indexOf('#');
        if (fragment >= 0) {
            tail = tail.substring(0, fragment);
        }
        tail = tail.substring(tail.lastIndexOf('/') + 1);
        int dot = tail.lastIndexOf('.');
        if (dot < 0) {
            return extension;
        }
        String ext = tail.substring(dot);
        if (ext.length() < 2 || ext.length() > MAX_EXTENSION_LENGTH) {
            return extension;
        }
        for (int i = 1; i < ext.length(); i++) {
            if (!Character.isLetterOrDigit(ext.charAt(i))) {
                return extension;
            }
        }
        return ext;
    }

    /**
     * Derives a file-system-safe name from a URL. The name depends only on the
     * URL, so re-running a download finds and skips files it already fetched.
     */
    private static String fileNameFor(String picUrl) {
        return String.format("%08x", picUrl.hashCode());
    }

    /**
     * Downloads every URL sequentially on the calling thread and prints a
     * summary. {@code null} entries are skipped.
     *
     * @param data picture URLs
     * @param word category, used as the sub-directory name
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            try {
                // The raw URL contains '/' and ':' and cannot serve as a file name.
                downloadImg(picUrl, word, fileNameFor(picUrl));
            } catch (Exception e) {
                fails.incrementAndGet();
                System.err.println("下载失败: " + picUrl + " (" + e + ")");
            }
        }
        System.out.println("下载成功: " + suc.get());
        System.out.println("下载失败: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads every URL on a cached thread pool, one task per URL, waits up
     * to 60 seconds for the tasks to finish and prints a summary. Files are
     * named after the zero-padded position of the URL in {@code data};
     * {@code null} entries are skipped.
     *
     * @param data picture URLs
     * @param word category, used as the sub-directory name
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            // At least four digits; indices of 1000 and above keep their full value.
            String name = String.format("%04d", i);
            executorService.execute(() -> {
                try {
                    downloadImg(picUrl, word, name);
                } catch (Exception e) {
                    fails.incrementAndGet();
                    System.err.println("下载失败: " + picUrl + " (" + e + ")");
                }
            });
        }
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                // Tasks still running are not cancelled; the counts below are then partial.
                System.out.println("等待超时,仍有下载任务在执行");
            }
            System.out.println("AwaitTermination Finished");
            System.out.println("共有URL: " + data.size());
            System.out.println("下载成功: " + suc);
            System.out.println("下载失败: " + fails);

            // list() returns null when the directory was never created.
            String[] names = new File(this.path + "/" + word + "/").list();
            int len = names == null ? 0 : names.length;
            System.out.println("当前共有文件: " + len);

            long end = System.currentTimeMillis();
            System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            // Restore the flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            System.err.println("等待下载完成时被中断");
        }
    }

    /**
     * Splits {@code data} into {@code threadNum} contiguous segments and runs
     * {@link #process(List, String)} on each segment in its own pool thread.
     * Falls back to a single-threaded run when {@code threadNum} is not
     * positive or exceeds the number of URLs. This method does not wait for
     * the workers to finish.
     *
     * @param data      picture URLs
     * @param word      category, used as the sub-directory name
     * @param threadNum number of segments / worker tasks
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum <= 0 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum;    // URLs per segment
        for (int i = 0; i < threadNum; i++) {
            int start = i * num;
            // The last segment also takes the remainder.
            int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> process(cutList, word));
        }
        executorService.shutdown();
    }

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值