HttpClient代理访问网站并下载一千张美女图
HttpClient代理访问网站并爬虫下载一千张图片
proxy类
package proxy;
import com.alibaba.fastjson.JSONObject;
import proxy.HttpClientUtils;
import proxy.SougouImgPipeline;
import java.util.ArrayList;
import java.util.List;
/**
* A simple PageProcessor.
*
* @author liyangwei
* @since 0.1.0
*/
@SuppressWarnings("unused")
public class proxy {
private String url;
private SougouImgPipeline pipeline;
private List<JSONObject> dataList;
private List<String> urlList;
private String word;
public proxy(String url,String word) {
this.url = url;
this.word = word;
this.pipeline = new SougouImgPipeline();
this.dataList = new ArrayList<>();
this.urlList = new ArrayList<>();
}
public void process(int idx, int size) {
String res = HttpClientUtils.get(String.format(this.url, idx, size, this.word));
JSONObject object = JSONObject.parseObject(res);
@SuppressWarnings("unchecked")
List<JSONObject> items = (List<JSONObject>)((JSONObject)object.get("data")).get("items");
for(JSONObject item : items){
this.urlList.add(item.getString("picUrl"));
}
this.dataList.addAll(items);
}
// 下载
public void pipelineData(){
//pipeline.process(this.urlList, word); // 单线程
pipeline.processSync(this.urlList, this.word); // 多线程
}
public static void main(String[] args) {
String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
proxy processor = new proxy(url,"美女");
int start = 0, size = 50, limit = 1000; // 定义爬取开始索引、每次爬取数量、总共爬取数量
for(int i=start;i<start+limit;i+=size) {
//HttpClient请求
processor.process(i, size);
}
//下载
processor.pipelineData();
}
}
ProxyConfig类
此类为代理 IP、端口以及代理服务器账号密码的配置类
package proxy;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.impl.client.BasicCredentialsProvider;
/**
 * Holds the proxy endpoint and the credentials used to authenticate against it.
 */
public class ProxyConfig {
    /** Proxy host and port. */
    HttpHost proxy;
    /** Credentials provider carrying the proxy user name and password. */
    CredentialsProvider provider;

    /**
     * Creates the configuration with the built-in default proxy and account.
     *
     * NOTE(review): the defaults embed a real-looking account and password in
     * source code. They are kept only so existing callers behave as before;
     * prefer the parameterized constructor and load the values from
     * configuration outside the source tree.
     */
    ProxyConfig() {
        this("10.36.6.66", 3128, "F1232170", "44DRunBA");
    }

    /**
     * Creates the configuration for an arbitrary authenticated proxy.
     *
     * @param host     proxy host name or IP address
     * @param port     proxy port
     * @param username proxy account name
     * @param password proxy account password
     */
    ProxyConfig(String host, int port, String username, String password) {
        // proxy endpoint
        proxy = new HttpHost(host, port);
        provider = new BasicCredentialsProvider();
        // the credentials are scoped to this proxy host only
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(username, password));
    }

    /** @return the proxy host and port */
    public HttpHost get_proxy() {
        return this.proxy;
    }

    /** @return the provider holding the proxy credentials */
    public CredentialsProvider get_provider() {
        return this.provider;
    }
}
HttpClientUtils类
此类为HttpClient访问网站类
package proxy;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
/**
 * Static helpers that issue HTTP GET requests with Apache HttpClient, either
 * through the authenticated proxy described by {@link ProxyConfig} or directly.
 */
public abstract class HttpClientUtils {
    public static Logger logger = LoggerFactory.getLogger(HttpClientUtils.class);

    /**
     * Performs a GET request through the configured proxy and returns the body
     * decoded as UTF-8.
     *
     * @param url absolute URL to fetch
     * @return the response body, or {@code ""} when the request failed with an I/O error
     */
    public static String get(String url) {
        String charset = "UTF-8";
        HttpGet httpGet = new HttpGet(url);
        // optional header marking the request as asynchronous (AJAX)
        //httpGet.setHeader("X-Requested-With", "XMLHttpRequest");
        // request through the proxy
        return executeRequest_proxy(httpGet, charset);
        // request without a proxy
        //return executeRequest(httpGet, charset);
    }

    /**
     * Executes the request through the proxy from {@link ProxyConfig}.
     *
     * @param httpGet request to execute; its request config is replaced with one that sets the proxy
     * @param charset charset name used to decode the body when the response does not declare one
     * @return the response body, or {@code ""} when the request failed with an I/O error
     */
    public static String executeRequest_proxy(HttpGet httpGet, String charset) {
        ProxyConfig proxyconfig = new ProxyConfig();
        HttpHost proxy = proxyconfig.get_proxy();
        // route this request through the proxy host and port
        RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
        httpGet.setConfig(config);
        // the client carries the proxy account and password
        CredentialsProvider provider = proxyconfig.get_provider();
        // The original code first built a client with createSSLInsecureClient() and then
        // immediately overwrote the variable, so that first client was never used and never
        // closed. Only the client that actually serves the request is created here.
        CloseableHttpClient httpclient =
                HttpClients.custom().setDefaultCredentialsProvider(provider).build();
        return executeAndRead(httpclient, httpGet, charset, "开始HttpClient请求", "HttpClient请求完成");
    }

    /**
     * Executes the request directly, without a proxy. For {@code https} URLs a
     * client that skips certificate and host-name checks is used.
     *
     * @param httpRequest request to execute
     * @param charset     charset name used to decode the body when the response does not declare one
     * @return the response body, or {@code ""} when the request failed with an I/O error
     */
    public static String executeRequest(HttpUriRequest httpRequest, String charset) {
        CloseableHttpClient httpclient;
        if ("https".equals(httpRequest.getURI().getScheme())) {
            httpclient = createSSLInsecureClient();
        } else {
            httpclient = HttpClients.createDefault();
        }
        return executeAndRead(httpclient, httpRequest, charset,
                "无代理HttpClient请求开始", "无代理HttpClient请求完成");
    }

    /**
     * Runs the request on the given client, reads the whole body, and closes
     * both the response and the client.
     */
    private static String executeAndRead(CloseableHttpClient client, HttpUriRequest request,
            String charset, String startMessage, String doneMessage) {
        String result = "";
        try (CloseableHttpClient managedClient = client) {
            System.out.println(startMessage);
            try (CloseableHttpResponse response = managedClient.execute(request)) {
                System.out.println(doneMessage);
                HttpEntity entity = response.getEntity();
                // a response may carry no entity at all; keep the empty result in that case
                if (entity != null) {
                    try {
                        result = EntityUtils.toString(entity, charset);
                    } finally {
                        // releases every resource still held by the entity
                        EntityUtils.consume(entity);
                    }
                }
            }
        } catch (IOException ex) {
            logger.error("HTTP request failed: " + request.getURI(), ex);
        }
        return result;
    }

    /**
     * Builds a client whose TLS layer accepts every certificate chain and every
     * host name.
     *
     * WARNING: this disables TLS peer verification entirely and exposes the
     * connection to man-in-the-middle attacks; use it only against hosts you
     * control.
     *
     * @return a new client; the caller is responsible for closing it
     * @throws RuntimeException wrapping the {@link GeneralSecurityException} raised while building the SSL context
     */
    public static CloseableHttpClient createSSLInsecureClient() {
        try {
            System.out.println("SSLContext");
            // trust every certificate chain, including self-signed ones
            SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(new TrustStrategy() {
                @Override
                public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    return true;
                }
            }).build();
            System.out.println("SSLConnectionSocketFactory");
            // accept every host name; no protocol restriction is configured here
            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, new HostnameVerifier() {
                @Override
                public boolean verify(String hostname, SSLSession session) {
                    return true;
                }
            });
            // client that uses the permissive socket factory
            return HttpClients.custom().setSSLSocketFactory(sslsf).build();
        } catch (GeneralSecurityException ex) {
            throw new RuntimeException(ex);
        }
    }
}
SougouImgPipeline类
此类为美女图片下载类
package proxy;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Downloads pictures to the local file system, one sub-directory per category
 * (search word), and counts successes and failures across all calls.
 *
 * @author liyangei
 * @since 0.1.0
 */
public class SougouImgPipeline {
    /** Longest file-name stem kept verbatim before it is shortened. */
    private static final int MAX_NAME_LENGTH = 100;
    /** Longest extension (without the dot) accepted from a URL. */
    private static final int MAX_EXTENSION_LENGTH = 5;

    /** Extension used when the URL does not end in a usable one. */
    private String extension = ".jpg";
    /** Root directory under which the category directories are created. */
    private String path;
    /** Number of pictures written successfully. */
    private final AtomicInteger suc = new AtomicInteger();
    /** Number of downloads that ended with an exception. */
    private final AtomicInteger fails = new AtomicInteger();

    /** Creates a pipeline that stores pictures under the built-in default directory. */
    public SougouImgPipeline() {
        this("/Users/holmes/eclipse-workspace/proxy/data/picture/");
    }

    /**
     * @param path root directory for downloaded pictures
     */
    public SougouImgPipeline(String path) {
        setPath(path);
    }

    /**
     * @param path      root directory for downloaded pictures
     * @param extension fallback extension (including the dot) for URLs without one
     */
    public SougouImgPipeline(String path, String extension) {
        setPath(path);
        this.extension = extension;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads one picture into {@code <path>/<cate>/}. The file is named after
     * {@code name} plus the extension taken from the URL; an existing file with
     * that name is left untouched and the download is skipped.
     *
     * @param url  picture URL
     * @param cate category; used as the sub-directory name
     * @param name file-name stem; characters that are illegal in file names are replaced
     * @throws Exception when the URL is malformed or the transfer fails
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        System.out.println("下载");
        String path = this.path + "/" + cate + "/";
        File dir = new File(path);
        if (!dir.exists()) { // create the category directory on first use
            dir.mkdirs();
        }
        String fileName = toSafeFileName(name) + resolveExtension(url);
        fileName = fileName.replace("-", "");
        File img = new File(path + fileName);
        if (img.exists()) { // already downloaded earlier: skip
            System.out.println(String.format("文件%s已存在本地目录", fileName));
            return;
        }
        URLConnection con = new URL(url).openConnection();
        con.setConnectTimeout(5000);
        con.setReadTimeout(5000);
        try {
            try (InputStream inputStream = con.getInputStream();
                    FileOutputStream os = new FileOutputStream(img)) {
                byte[] bs = new byte[8192];
                int len;
                while ((len = inputStream.read(bs)) != -1) {
                    os.write(bs, 0, len);
                }
            }
        } catch (java.io.IOException ex) {
            // Remove a partial file; otherwise the "already exists" check above would
            // make every later run skip this picture forever.
            img.delete();
            throw ex;
        }
        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下载第%s张图片", suc.getAndIncrement()));
    }

    /**
     * Picks the file extension from the last path segment of the URL, ignoring
     * query string and fragment. Falls back to {@link #extension} when that
     * segment has no short alphanumeric extension.
     */
    private String resolveExtension(String url) {
        String resource = url;
        int queryStart = resource.indexOf('?');
        if (queryStart >= 0) {
            resource = resource.substring(0, queryStart);
        }
        int fragmentStart = resource.indexOf('#');
        if (fragmentStart >= 0) {
            resource = resource.substring(0, fragmentStart);
        }
        // skip "scheme://host" so a dot in the host name is never taken for an extension
        int schemeEnd = resource.indexOf("://");
        int pathStart = resource.indexOf('/', schemeEnd >= 0 ? schemeEnd + 3 : 0);
        if (pathStart < 0) {
            return extension;
        }
        int lastSlash = resource.lastIndexOf('/');
        int lastDot = resource.lastIndexOf('.');
        if (lastDot > lastSlash && lastDot < resource.length() - 1) {
            String suffix = resource.substring(lastDot + 1);
            if (suffix.length() <= MAX_EXTENSION_LENGTH && isAlphanumeric(suffix)) {
                return "." + suffix;
            }
        }
        return extension;
    }

    /** Tells whether the text consists only of ASCII letters and digits. */
    private static boolean isAlphanumeric(String text) {
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            boolean ok = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            if (!ok) {
                return false;
            }
        }
        return true;
    }

    /**
     * Replaces characters that cannot appear in a file name and shortens very
     * long names, prefixing a hash of the full name so shortened names stay distinct.
     */
    private static String toSafeFileName(String name) {
        String safe = name.replaceAll("[\\\\/:*?\"<>|\\s]", "_");
        if (safe.length() > MAX_NAME_LENGTH) {
            // the tail of a URL is usually its most distinctive part
            safe = Integer.toHexString(name.hashCode()) + "_"
                    + safe.substring(safe.length() - MAX_NAME_LENGTH);
        }
        return safe;
    }

    /**
     * Downloads all pictures sequentially on the calling thread. Each file is
     * named after its (sanitized) URL.
     *
     * @param data picture URLs; null entries are skipped
     * @param word category; used as the sub-directory name
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            try {
                downloadImg(picUrl, word, picUrl);
            } catch (Exception e) {
                fails.incrementAndGet();
                System.err.println("Download failed for " + picUrl + ": " + e);
            }
        }
        System.out.println("下载成功: " + suc.get());
        System.out.println("下载失败: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads all pictures concurrently, one task per URL, and waits up to 60
     * seconds for them to finish before printing the statistics. Each file is
     * named after the zero-padded position of its URL in {@code data}.
     *
     * @param data picture URLs; null entries are skipped
     * @param word category; used as the sub-directory name
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            final String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            // At least four digits; indexes of 1000 and above keep all their digits
            // instead of collapsing to an empty name.
            final String finalName = String.format(java.util.Locale.ROOT, "%04d", i);
            executorService.execute(() -> {
                try {
                    downloadImg(picUrl, word, finalName);
                } catch (Exception e) {
                    fails.incrementAndGet();
                    System.err.println("Download failed for " + picUrl + ": " + e);
                }
            });
        }
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                // Tasks are deliberately not interrupted; they keep running in the background.
                System.out.println("Downloads still running after 60 seconds; statistics below are partial");
            }
            System.out.println("AwaitTermination Finished");
            System.out.println("共有URL: " + data.size());
            System.out.println("下载成功: " + suc);
            System.out.println("下载失败: " + fails);
            File dir = new File(this.path + "/" + word + "/");
            // list() returns null when the directory was never created (e.g. nothing to download)
            String[] entries = dir.list();
            int len = entries == null ? 0 : entries.length;
            System.out.println("当前共有文件: " + len);
            long end = System.currentTimeMillis();
            System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            // restore the interrupt flag so callers can observe the interruption
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Splits the URLs into {@code threadNum} contiguous segments and runs
     * {@link #process(List, String)} on each segment in its own task. Returns
     * without waiting for the tasks to finish. Falls back to a single
     * sequential run when there are fewer URLs than threads or when
     * {@code threadNum} is not positive.
     *
     * @param data      picture URLs
     * @param word      category; used as the sub-directory name
     * @param threadNum number of segments / tasks
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum < 1 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int num = data.size() / threadNum; // URLs per segment
        for (int i = 0; i < threadNum; i++) {
            int start = i * num;
            // the last segment also takes the remainder
            int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
            final List<String> cutList = data.subList(start, end);
            executorService.execute(() -> process(cutList, word));
        }
        executorService.shutdown();
    }
}