主要有两类线程:图片url抓取线程、图片下载保存线程。
图片下载保存采用线程池处理,主要利用java的ThreadPoolExecutor实现。
url抓取线程:
package sys.gifspider;
import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sys.gifspider.utils.PropertyUtil;
/**
 * Crawler worker. Takes list-page URLs from a shared queue, follows every
 * thumbnail link on the page to its detail page, extracts the real image URL
 * and hands it to an internal thread pool as a {@link GifProcessor} task.
 *
 * <p>The pool is shut down when {@link #run()} returns, so an instance is
 * meant to be run once.
 */
public class GifSpider implements Runnable
{
    /** Loop flag; volatile so a change made through {@link #setRunning(boolean)} is seen by the crawl thread. */
    volatile boolean isRunning = true;
    /** Pool that executes the download/save tasks. */
    private ThreadPoolExecutor threadPool;
    /** Shared queue of list-page URLs still to be crawled. */
    BlockingQueue<String> queue;

    /**
     * @param queue queue of list-page URLs this spider consumes
     */
    public GifSpider(BlockingQueue<String> queue)
    {
        this.queue = queue;
        this.init();
    }

    /**
     * Builds the download thread pool from the {@code threadpool.*} settings
     * returned by {@code PropertyUtil.getProperties()}.
     */
    private void init()
    {
        Properties pro = PropertyUtil.getProperties();
        int corePoolSize = Integer.parseInt(pro.getProperty("threadpool.corePoolSize"));
        int maxPoolSize = Integer.parseInt(pro.getProperty("threadpool.maxPoolSize"));
        int keepAliveSeconds = Integer.parseInt(pro.getProperty("threadpool.keepAliveSeconds"));
        int queueCap = Integer.parseInt(pro.getProperty("threadpool.queueCapacity"));
        // Named workQueue so it does not shadow the URL queue field.
        BlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<Runnable>(queueCap);
        // CallerRunsPolicy: when the bounded work queue and all threads are busy,
        // the crawl thread runs the task itself instead of the default policy
        // throwing RejectedExecutionException out of run().
        this.threadPool = new ThreadPoolExecutor(
                corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS,
                workQueue, new ThreadPoolExecutor.CallerRunsPolicy());
    }

    /**
     * @return whether the crawl loop is still allowed to continue
     */
    public boolean isRunning()
    {
        return isRunning;
    }

    /**
     * @param isRunning pass {@code false} to make {@link #run()} stop after its
     *                  current page (or within about a second when idle)
     */
    public void setRunning(boolean isRunning)
    {
        this.isRunning = isRunning;
    }

    /**
     * Crawl loop. Runs until {@link #setRunning(boolean)} is called with
     * {@code false} or the thread is interrupted, then shuts the pool down so
     * already submitted downloads finish and the pool threads can exit.
     */
    @Override
    public void run()
    {
        try
        {
            while (this.isRunning)
            {
                try
                {
                    // Poll with a timeout instead of blocking forever so the
                    // isRunning flag is re-checked while the queue is empty.
                    String url = this.queue.poll(1, TimeUnit.SECONDS);
                    if (url == null)
                    {
                        continue;
                    }
                    this.crawlListPage(url);
                    // Pause between list pages to avoid hammering the site.
                    Thread.sleep(1000);
                } catch (InterruptedException e)
                {
                    // Restore the interrupt status and stop instead of looping on.
                    Thread.currentThread().interrupt();
                    break;
                } catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        } finally
        {
            this.threadPool.shutdown();
        }
    }

    /**
     * Fetches one list page and submits every image linked from it.
     *
     * @param url absolute URL of the list page
     * @throws IOException if the list page itself cannot be fetched
     */
    private void crawlListPage(String url) throws IOException
    {
        System.out.println("请求url:" + url);
        Document doc = Jsoup.connect(url).get();
        Element list = doc.select("div.pic_list2").first();
        if (list == null)
        {
            // Page layout differs from what the selector expects; nothing to crawl.
            System.out.println("未找到图片列表,url:" + url);
            return;
        }
        // All <a href> inside the list container.
        for (Element link : list.select("a[href]"))
        {
            // Each picture has an image link and a text link pointing to the
            // same detail page; only the image link is followed.
            Elements thumbs = link.select("img");
            if (thumbs.size() == 0)
            {
                continue;
            }
            String pageUrl = link.absUrl("href");
            if (pageUrl.length() == 0)
            {
                continue;
            }
            try
            {
                this.submitImage(pageUrl, thumbs.attr("alt"));
            } catch (IOException e)
            {
                // One broken detail page must not abort the rest of the list.
                System.out.println("获取图片页面失败,url:" + pageUrl);
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetches a detail page, resolves the real image URL and queues the download.
     *
     * @param pageUrl absolute URL of the detail page
     * @param text    alt text of the thumbnail, used as the image name
     * @throws IOException if the detail page cannot be fetched
     */
    private void submitImage(String pageUrl, String text) throws IOException
    {
        Document detail = Jsoup.connect(pageUrl).get();
        Element container = detail.getElementById("endtext");
        if (container == null)
        {
            System.out.println("未找到图片,url:" + pageUrl);
            return;
        }
        // The page source uses a relative path; abs:src resolves it to an absolute URL.
        String realUrl = container.select("img").attr("abs:src");
        if (realUrl.length() == 0)
        {
            System.out.println("未找到图片,url:" + pageUrl);
            return;
        }
        System.out.println("获取图片url:" + realUrl);
        // Hand the image URL over to the thread pool.
        this.threadPool.execute(new GifProcessor(text, realUrl));
    }
}
图片处理线程很简单,就是图片下载和保存:
package sys.gifspider;
import sys.gifspider.utils.FileProcessor;
/**
 * Runnable task that downloads a single image and stores it on disk by
 * delegating to {@link FileProcessor#saveGif()}.
 */
public class GifProcessor implements Runnable
{
    /** Name to store the image under. */
    private String imgName;
    /** URL the image is downloaded from. */
    private String imgUrl;

    /**
     * @param name name to store the image under
     * @param url  URL of the image to download
     */
    public GifProcessor(String name, String url)
    {
        imgName = name;
        imgUrl = url;
    }

    /**
     * Downloads and saves the image. A failure is reported on standard output
     * together with its stack trace and is not rethrown.
     */
    @Override
    public void run()
    {
        final FileProcessor saver = new FileProcessor(imgName, imgUrl);
        try
        {
            System.out.println("下载保存图片url:" + imgUrl);
            saver.saveGif();
        }
        catch (Exception failure)
        {
            System.out.println("下载保存图片失败,url:" + imgUrl);
            failure.printStackTrace();
        }
    }
}
package sys.gifspider.utils;
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Downloads one image over HTTP and writes it into the directory configured
 * under the {@code dir} property.
 */
public class FileProcessor
{
    /** Upper bound for establishing the HTTP connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    /** Upper bound for a single blocking read, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30000;
    /** Extension used when the URL path does not carry one. */
    private static final String DEFAULT_EXTENSION = ".gif";

    private String imgName;
    private String imgUrl;

    /**
     * @param name name to store the image under (without extension)
     * @param url  URL of the image to download
     */
    public FileProcessor(String name, String url)
    {
        this.imgName = name;
        this.imgUrl = url;
    }

    /**
     * Returns the configured target directory, creating it (including missing
     * parent directories) if it does not exist yet.
     *
     * @return the value of the {@code dir} property
     * @throws java.io.IOException   if the directory cannot be created
     * @throws IllegalStateException if the {@code dir} property is not set
     */
    private String makeDir() throws java.io.IOException
    {
        String strdir = PropertyUtil.getProperties().getProperty("dir");
        if (strdir == null)
        {
            throw new IllegalStateException("property 'dir' is not configured");
        }
        File dir = new File(strdir);
        // mkdirs() also creates missing parents; the second isDirectory() check
        // covers another pool thread creating the directory concurrently.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory())
        {
            throw new java.io.IOException("cannot create directory: " + strdir);
        }
        return strdir;
    }

    /**
     * Downloads the image and writes it to {@code <dir>/<name><extension>}.
     * Nothing is written when the download yields no bytes.
     *
     * @throws Exception if the download or the file write fails
     */
    public void saveGif() throws Exception
    {
        String dir = makeDir();
        // File(parent, child) inserts the separator when 'dir' lacks a trailing one.
        File file = new File(dir, safeFileName() + extensionOf(this.imgUrl));
        byte[] bit = this.download();
        if (bit.length > 0)
        {
            BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file));
            try
            {
                out.write(bit);
                out.flush();
            } finally
            {
                out.close();
            }
        }
    }

    /**
     * Turns the image name into something usable as a file name: characters
     * that are illegal in file names on common file systems are replaced by
     * '_'. An empty result falls back to a name derived from the URL.
     */
    private String safeFileName()
    {
        String name = this.imgName == null ? "" : this.imgName;
        name = name.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
        if (name.length() == 0)
        {
            name = "image_" + Integer.toHexString(String.valueOf(this.imgUrl).hashCode());
        }
        return name;
    }

    /**
     * Extracts the file extension (including the dot) from the path part of a
     * URL, ignoring any query string or fragment.
     *
     * @return the extension, or {@code .gif} when the last path segment has none
     */
    private static String extensionOf(String url)
    {
        String path = url;
        int cut = path.indexOf('?');
        if (cut >= 0)
        {
            path = path.substring(0, cut);
        }
        cut = path.indexOf('#');
        if (cut >= 0)
        {
            path = path.substring(0, cut);
        }
        int slash = path.lastIndexOf('/');
        int dot = path.lastIndexOf('.');
        // Only a dot inside the last path segment, followed by at least one
        // character, counts as an extension.
        if (dot > slash && dot < path.length() - 1)
        {
            return path.substring(dot);
        }
        return DEFAULT_EXTENSION;
    }

    /**
     * Downloads the image into memory.
     *
     * @return the bytes of the response body
     * @throws Exception if the URL is malformed or the transfer fails
     */
    private byte[] download() throws Exception
    {
        URL url = new URL(this.imgUrl);
        HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
        // Without timeouts a stalled server would block a pool thread forever.
        httpConn.setConnectTimeout(CONNECT_TIMEOUT_MS);
        httpConn.setReadTimeout(READ_TIMEOUT_MS);
        try
        {
            httpConn.connect();
            InputStream cin = httpConn.getInputStream();
            try
            {
                ByteArrayOutputStream outStream = new ByteArrayOutputStream();
                byte[] buffer = new byte[1024];
                int len;
                while ((len = cin.read(buffer)) != -1)
                {
                    outStream.write(buffer, 0, len);
                }
                return outStream.toByteArray();
            } finally
            {
                cin.close();
            }
        } finally
        {
            httpConn.disconnect();
        }
    }
}
程序入口如下:
package sys.gifspider;
import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sys.gifspider.utils.PropertyUtil;
/**
 * Program entry point: fills the URL queue with the configured page range and
 * starts the configured number of {@link GifSpider} threads.
 */
public class Main
{
    public static void main(String[] args)
    {
        init();
    }

    /**
     * Reads {@code startPage}, {@code endPage}, {@code url} and
     * {@code spiderThread} from {@code PropertyUtil.getProperties()}, queues
     * one list-page URL per page in {@code [startPage, endPage]} and starts
     * the spider threads.
     *
     * @throws IllegalArgumentException if {@code endPage} is less than {@code startPage}
     */
    public static void init()
    {
        Properties pro = PropertyUtil.getProperties();
        int startPage = Integer.parseInt(pro.getProperty("startPage"));
        int endPage = Integer.parseInt(pro.getProperty("endPage"));
        String url = pro.getProperty("url");
        int count = endPage - startPage + 1;
        if (count <= 0)
        {
            throw new IllegalArgumentException(
                    "endPage (" + endPage + ") must not be less than startPage (" + startPage + ")");
        }
        BlockingQueue<String> queue = new LinkedBlockingQueue<String>(count);
        // Iterate over the real page numbers; counting from 1 would ignore
        // startPage and crawl the wrong pages whenever startPage != 1.
        for (int page = startPage; page <= endPage; page++)
        {
            queue.add(String.format(url, page));
        }
        int spiderCount = Integer.parseInt(pro.getProperty("spiderThread"));
        for (int i = 0; i < spiderCount; i++)
        {
            GifSpider spider = new GifSpider(queue);
            Thread t = new Thread(spider);
            t.start();
        }
    }
}
配置文件:
spiderThread=1
threadpool.corePoolSize=8
threadpool.maxPoolSize=10
threadpool.keepAliveSeconds=600
threadpool.queueCapacity=1000
startPage=1
endPage=20
url=http://www.haha365.com/gxtp/index_gif_%d.htm
dir=E:/spider/
用haha365的动态gif做了下测试,如果想爬别的网站,自己根据人家的html结构,改一下爬取规则即可。
程序中没做过多的容错处理,可能存在一定的bug。