本文地址:http://blog.csdn.net/qq_34199125/article/details/54811840
部分参考https://www.oschina.net/code/snippet_1034405_56137
用httpclient4.5.3包中的CloseableHttpResponse获取HTML
用正则表达式过滤出网页中的img src
用多线程下载图片,并且使用简单的线程锁保证图片URL表的数据安全,并记录下下载失败的URL
下面是代码
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.omg.CORBA.Current;
/**
 * Entry point of the crawler.
 *
 * <p>One instance, run on its own thread, walks the listing pages from {@code page} down to 1,
 * fetches each page's HTML and hands it to an {@link HtmlParser} thread. {@link #main} drains the
 * shared list of discovered images and submits one {@link ImageCreator} per image to a thread pool.
 *
 * <p>Thread-safety: both shared lists are instances of a private {@code ArrayList} subclass whose
 * basic operations ({@code add}, {@code get}, {@code remove(int)}, {@code size}, {@code isEmpty})
 * are synchronized on the list itself, so code that obtains a list through the getters and calls
 * {@code add} on it is safe without further locking. Compound operations (check-then-act,
 * iteration, bulk methods) must be wrapped in {@code synchronized (list)} by the caller.
 */
public class HttpClientDemo implements Runnable {
    public static HttpClientDemo httpClientdemo;

    // Dedicated monitor objects. String literals must not be used as locks: every "" literal is
    // the same interned instance, so the original three "locks" were one JVM-wide monitor.
    private static final Object lock1 = new Object();
    private static final Object lock2 = new Object();

    /** Pause used by the dispatch loop when no image is waiting, to avoid spinning a core. */
    private static final long IDLE_POLL_MILLIS = 50L;

    RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).setConnectionRequestTimeout(6000).setConnectTimeout(6000).build();
    CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(globalConfig).build();

    private static int page = 3000; // first page to fetch; the crawl counts down from here

    // Images discovered by the parsers and not yet handed to a downloader.
    private static ArrayList<ImagePoint> imgSrcs = new SynchronizedImageList();
    // Images that were downloaded successfully.
    private static ArrayList<ImagePoint> downloadImgSrcs = new SynchronizedImageList();

    /**
     * Removes and returns the oldest pending image.
     *
     * @return the first pending image, or {@code null} when none is waiting
     */
    public static ImagePoint getPoint() {
        ArrayList<ImagePoint> pending = getImgSrcs();
        // Hold the list's own monitor so the emptiness check and the removal are one atomic step
        // with respect to concurrent add() calls.
        synchronized (pending) {
            if (pending.isEmpty()) {
                return null;
            }
            return pending.remove(0);
        }
    }

    /** @return the shared list of images waiting to be downloaded */
    public static ArrayList<ImagePoint> getImgSrcs() {
        synchronized (lock1) {
            return imgSrcs;
        }
    }

    /**
     * Replaces the pending-image list.
     *
     * @param imgSrcs the new content; a list that is not already synchronized is copied into a
     *                synchronized one, so later changes made through the caller's own reference
     *                are not seen. {@code null} is stored as-is.
     */
    public static void setImgSrcs(ArrayList<ImagePoint> imgSrcs) {
        synchronized (lock1) {
            HttpClientDemo.imgSrcs = threadSafe(imgSrcs);
        }
    }

    /** @return the shared list of images that have been downloaded */
    public static ArrayList<ImagePoint> getDownloadImgSrcs() {
        synchronized (lock2) {
            return downloadImgSrcs;
        }
    }

    /**
     * Replaces the downloaded-image list.
     *
     * @param downloadImgSrcs the new content; handled like the argument of
     *                        {@link #setImgSrcs(ArrayList)}
     */
    public static void setDownloadImgSrcs(ArrayList<ImagePoint> downloadImgSrcs) {
        synchronized (lock2) {
            HttpClientDemo.downloadImgSrcs = threadSafe(downloadImgSrcs);
        }
    }

    /**
     * Starts the crawler thread, waits five seconds, then dispatches downloads until the crawler
     * thread has finished and the pending list is empty.
     *
     * @param args unused
     * @throws InterruptedException if the dispatching thread is interrupted while sleeping
     */
    public static void main(String[] args) throws InterruptedException {
        System.out.println("5秒后开始抓取煎蛋妹子图……");
        httpClientdemo = new HttpClientDemo();
        ExecutorService pool = Executors.newCachedThreadPool();
        Thread mainThread = new Thread(httpClientdemo);
        mainThread.start();
        Thread.sleep(5000);
        try {
            while (mainThread.isAlive() || !getImgSrcs().isEmpty()) {
                ImagePoint temp = getPoint();
                if (temp != null) {
                    // Submit the Runnable itself; wrapping it in a Thread only wastes an object.
                    pool.execute(new ImageCreator(temp.getImageSrc(), temp.getPage()));
                    Thread.sleep(50);
                } else {
                    // Nothing queued yet: back off instead of busy-waiting.
                    Thread.sleep(IDLE_POLL_MILLIS);
                }
            }
        } finally {
            // Already-submitted downloads still run to completion after shutdown().
            pool.shutdown();
        }
    }

    /**
     * Fetches every listing page from {@code page} down to 1 and starts one parser thread per
     * page. A failure on one page is printed and the crawl continues with the next page; an
     * interrupt stops the crawl.
     */
    @Override
    public void run() {
        for (int i = page; i > 0; i--) {
            HttpGet httpGet = new HttpGet("http://jandan.net/ooxx/page-" + i);
            httpGet.addHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36");
            httpGet.addHeader("Cookie","_gat=1; nsfw-click-load=off; gif-click-load=on; _ga=GA1.2.1861846600.1423061484");
            try {
                // Throttle the crawl.
                Thread.sleep(500);
                String html;
                // Closing the response releases its connection even when reading the body fails.
                try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                    html = EntityUtils.toString(response.getEntity(), "utf-8");
                }
                new Thread(new HtmlParser(html, i)).start();
            } catch (InterruptedException e) {
                // Preserve the interrupt for whoever owns this thread and stop crawling.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /** Returns {@code list} unchanged if it is null or already synchronized, else a synchronized copy. */
    private static ArrayList<ImagePoint> threadSafe(ArrayList<ImagePoint> list) {
        if (list == null || list instanceof SynchronizedImageList) {
            return list;
        }
        return new SynchronizedImageList(list);
    }

    /**
     * {@code ArrayList} whose single-element operations lock on the list instance. It exists so
     * the public getters can keep returning {@code ArrayList} while callers on other threads call
     * {@code add} on the result. Methods not overridden here are NOT synchronized.
     */
    private static final class SynchronizedImageList extends ArrayList<ImagePoint> {
        private static final long serialVersionUID = 1L;

        SynchronizedImageList() {
            super();
        }

        SynchronizedImageList(ArrayList<ImagePoint> initial) {
            super(initial);
        }

        @Override
        public synchronized boolean add(ImagePoint point) {
            return super.add(point);
        }

        @Override
        public synchronized ImagePoint get(int index) {
            return super.get(index);
        }

        @Override
        public synchronized ImagePoint remove(int index) {
            return super.remove(index);
        }

        @Override
        public synchronized int size() {
            return super.size();
        }

        @Override
        public synchronized boolean isEmpty() {
            return super.isEmpty();
        }
    }
}
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts image URLs from one page of HTML and appends them to the shared pending list
 * obtained from {@code HttpClientDemo.getImgSrcs()}.
 *
 * <p>Each instance remembers its own page number, so parsers running concurrently for different
 * pages label their images correctly.
 */
public class HtmlParser implements Runnable {
    // Matches an <img> tag's src attribute ending in .jpg or .gif.
    public static final String imgRegex= "img\\s*src=\".+?\\.(jpg|gif)\"";
    // Matches the quoted attribute value inside an imgRegex match.
    public static final String urlRegex= "\".+\"";

    private final String html;
    // Page number of this parser instance.
    private final int pageNumber;
    // Page number of the most recently constructed parser. Kept only so the static
    // regexString(...) entry point, which has no page parameter, still has a value to use.
    private static volatile int page;

    /**
     * @param html the page source to scan
     * @param page the listing page number the source came from
     */
    public HtmlParser(String html,int page) {
        this.html = html;
        this.pageNumber = page;
        HtmlParser.page = page;
    }

    /** Prints a banner for this parser's page and collects its image URLs. */
    @Override
    public void run() {
        System.out.println("==========第"+pageNumber+"页============");
        collectImageSources(html, imgRegex, urlRegex, pageNumber);
    }

    /**
     * Scans {@code tagetStr} for image tags and queues every URL found, labelled with the page
     * number of the most recently constructed parser.
     *
     * @param tagetStr the HTML to scan; {@code null} is treated as containing no images
     * @param imgRegex pattern that matches an image tag fragment
     * @param urlRegex pattern that matches the quoted URL inside such a fragment
     */
    static void regexString(String tagetStr,String imgRegex,String urlRegex){
        collectImageSources(tagetStr, imgRegex, urlRegex, page);
    }

    /** Finds every image URL in {@code source} and appends it to the shared pending list. */
    private static void collectImageSources(String source, String imgRegex, String urlRegex, int pageNumber) {
        if (source == null) {
            return;
        }
        // Compile once per call rather than once per matched tag.
        Pattern urlPattern = Pattern.compile(urlRegex);
        Matcher tagMatcher = Pattern.compile(imgRegex).matcher(source);
        List<ImagePoint> found = new ArrayList<>();
        while (tagMatcher.find()) {
            Matcher urlMatcher = urlPattern.matcher(tagMatcher.group(0));
            if (urlMatcher.find()) {
                found.add(new ImagePoint(toAbsoluteUrl(urlMatcher.group(0)), pageNumber));
            }
        }
        if (found.isEmpty()) {
            return;
        }
        ArrayList<ImagePoint> shared = HttpClientDemo.getImgSrcs();
        // Lock the list while appending so parser threads do not corrupt it between themselves.
        synchronized (shared) {
            shared.addAll(found);
        }
    }

    /**
     * Strips the surrounding quotes from a matched attribute value and makes sure the result
     * carries a scheme. Values already starting with {@code http:} or {@code https:} are kept
     * unchanged (they used to be dropped or mangled); anything else, such as a
     * protocol-relative {@code //host/path}, gets {@code http:} prepended.
     */
    private static String toAbsoluteUrl(String quoted) {
        String raw = quoted.length() < 2 ? quoted : quoted.substring(1, quoted.length() - 1);
        if (raw.startsWith("http:") || raw.startsWith("https:")) {
            return raw;
        }
        return "http:" + raw;
    }
}
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
/**
 * Downloads one image to {@code basePath}, naming the file {@code <page>--<last URL segment>}.
 *
 * <p>On success the image is appended to the list returned by
 * {@code HttpClientDemo.getDownloadImgSrcs()}; on any failure the URL and page are appended to
 * {@code fault.txt} in the same directory.
 */
public class ImageCreator implements Runnable {
    // Number of images downloaded so far across all instances; guarded by nextCount().
    private static int count = 0;
    // Serialises appends to fault.txt so lines from different threads do not interleave.
    private static final Object FAULT_LOG_LOCK = new Object();
    private String imageUrl;
    private int page;
    // Target directory; adjust to taste.
    private static final String basePath = "F:/picture";

    /**
     * @param imageUrl absolute URL of the image to fetch
     * @param page     listing page the image was found on
     */
    public ImageCreator(String imageUrl,int page) {
        this.imageUrl = imageUrl;
        this.page = page;
    }

    /** Downloads the image, or records the failure in {@code fault.txt}. */
    @Override
    public void run() {
        File dir = new File(basePath);
        if(!dir.exists()){
            dir.mkdirs();
            System.out.println("图片存放于"+basePath+"目录下");
        }
        String imageName = imageUrl.substring(imageUrl.lastIndexOf("/")+1);
        try {
            File file = new File( basePath+"/"+page+"--"+imageName);
            URL url = new URL(imageUrl);
            // Open the remote stream first: if the URL is unreachable no empty file is created.
            try (InputStream is = url.openStream()) {
                copyToFile(is, file);
            }
            System.out.println("第"+nextCount()+"张妹子:"+file.getAbsolutePath());
            java.util.List<ImagePoint> downloaded = HttpClientDemo.getDownloadImgSrcs();
            // Several downloader threads append here; lock the list for the duration of the add.
            synchronized (downloaded) {
                downloaded.add(new ImagePoint(imageUrl,page));
            }
        } catch (Exception e) {
            e.printStackTrace();
            recordFailure();
        }
    }

    /** Returns the current download counter and then increments it, atomically. */
    private static synchronized int nextCount() {
        return count++;
    }

    /**
     * Copies {@code in} to {@code target}. The output stream is always closed, and a file that
     * was opened but not written completely is deleted so no truncated image is left behind.
     */
    private static void copyToFile(InputStream in, File target) throws IOException {
        boolean opened = false;
        boolean complete = false;
        try (OutputStream os = new FileOutputStream(target)) {
            opened = true;
            byte[] buff = new byte[1024];
            int readed;
            while ((readed = in.read(buff)) != -1) {
                // Write only the bytes actually read; no intermediate copy needed.
                os.write(buff, 0, readed);
            }
            complete = true;
        } finally {
            if (opened && !complete) {
                target.delete();
            }
        }
    }

    /** Appends this image's URL and page to fault.txt; a failure to do so is only printed. */
    private void recordFailure() {
        File faultFile = new File( basePath+"/"+"fault.txt");
        synchronized (FAULT_LOG_LOCK) {
            try (FileOutputStream o = new FileOutputStream(faultFile,true)) {
                o.write((imageUrl+"------"+page+"\r\n").getBytes("utf-8"));
            } catch (IOException e1) {
                e1.printStackTrace();
                return;
            }
        }
        System.out.println("---成功把第"+page+"页下载失败的写入"+basePath+"/"+"fault.txt中---");
    }
}
// An image URL together with the listing page it was found on.
public class ImagePoint {

    /** Location of the image. */
    private String imageSrc;

    /** Number of the listing page the image came from. */
    private int page;

    /**
     * Creates a point for the given image and page.
     *
     * @param imageSrc location of the image
     * @param page     number of the listing page the image came from
     */
    public ImagePoint(String imageSrc, int page) {
        setImageSrc(imageSrc);
        setPage(page);
    }

    /** @return the location of the image */
    public final String getImageSrc() {
        return this.imageSrc;
    }

    /** @param imageSrc the new location of the image */
    public final void setImageSrc(String imageSrc) {
        this.imageSrc = imageSrc;
    }

    /** @return the number of the listing page the image came from */
    public final int getPage() {
        return this.page;
    }

    /** @param page the new listing page number */
    public final void setPage(int page) {
        this.page = page;
    }
}