Jsoup初接触-发一个Jsoup抓取图片的程序

主要有两个线程:图片url抓取线程、图片下载保存线程。

图片下载保存采用线程池处理,主要利用java的ThreadPoolExecutor实现。

url抓取线程:

package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Crawler thread: takes list-page URLs from a shared queue, follows every
 * thumbnail link on the page to its detail page, extracts the real image URL
 * and hands it to an internal thread pool as a {@link GifProcessor} task.
 *
 * <p>The thread pool is owned by this spider and is shut down when
 * {@link #run()} returns; already-submitted downloads still complete.
 * To stop a spider that is blocked waiting for a URL, interrupt its thread.
 */
public class GifSpider implements Runnable
{

  /** Loop flag checked at the top of every iteration of {@link #run()}. */
  volatile boolean isRunning = true;
  /** Pool that executes the image download/save tasks. */
  private ThreadPoolExecutor threadPool;
  /** Source of list-page URLs to crawl. */
  BlockingQueue<String> queue;
  
  /**
   * @param queue queue of list-page URLs this spider consumes
   */
  public GifSpider(BlockingQueue<String> queue)
  {
    this.queue = queue;
    this.init();
  }
  
  /**
   * Builds the download thread pool from the {@code threadpool.*} properties.
   */
  private void init()
  {
    Properties pro = PropertyUtil.getProperties();
    int corePoolSize = Integer.parseInt(pro.getProperty("threadpool.corePoolSize"));
    int maxPoolSize = Integer.parseInt(pro.getProperty("threadpool.maxPoolSize"));
    int keepAliveSeconds = Integer.parseInt(pro.getProperty("threadpool.keepAliveSeconds"));
    int queueCap = Integer.parseInt(pro.getProperty("threadpool.queueCapacity"));
    // Named differently from the URL queue field to avoid shadowing it.
    BlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<Runnable>(queueCap);
    this.threadPool = new ThreadPoolExecutor(
        corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS, 
        workQueue);
  }

  public boolean isRunning()
  {
    return isRunning;
  }

  public void setRunning(boolean isRunning)
  {
    this.isRunning = isRunning;
  }

  /**
   * Main loop: take a URL, crawl it, pause one second, repeat until the
   * running flag is cleared or the thread is interrupted.
   */
  @Override
  public void run()
  {
    try
    {
      while (this.isRunning)
      {
        try
        {
          String url = this.queue.take();
          this.crawlListPage(url);
          // Throttle requests so the target site is not hammered.
          Thread.sleep(1000);
        } catch (InterruptedException e)
        {
          // Interruption is the stop signal: keep the flag set for callers
          // further up the stack and leave the loop.
          Thread.currentThread().interrupt();
          this.isRunning = false;
        } catch (IOException e)
        {
          e.printStackTrace();
        }
      }
    } finally
    {
      // Let queued downloads finish, then release the pool threads so they
      // do not keep the JVM alive after the spider has stopped.
      this.threadPool.shutdown();
    }
  }

  /**
   * Fetches one list page and processes every thumbnail link found inside
   * the {@code div.pic_list2} container.
   *
   * @param url absolute URL of the list page
   * @throws IOException if the list page itself cannot be fetched
   */
  private void crawlListPage(String url) throws IOException
  {
    System.out.println("请求url:" + url);
    Document doc = Jsoup.connect(url).get();
    Element list = doc.select("div.pic_list2").first();
    if (list == null)
    {
      // Page layout differs from what the selector expects; nothing to do.
      System.out.println("未找到图片列表,跳过url:" + url);
      return;
    }
    for (Element link : list.select("a[href]"))
    {
      // Each picture has an image link and a text link pointing to the same
      // detail page; only the image link is followed.
      Elements thumbs = link.select("img");
      if (thumbs.size() == 0)
      {
        continue;
      }
      String detailUrl = link.absUrl("href");
      try
      {
        this.submitImage(detailUrl, thumbs.attr("alt"));
      } catch (IOException e)
      {
        // A single broken detail page must not abort the rest of the list.
        System.out.println("处理图片页面失败,url:" + detailUrl);
        e.printStackTrace();
      }
    }
  }

  /**
   * Fetches a detail page, resolves the real image URL inside the element
   * with id {@code endtext} and submits a download task for it.
   *
   * @param detailUrl absolute URL of the detail page (may be empty if the
   *                  link could not be resolved)
   * @param title     alt text of the thumbnail, used as the image name
   * @throws IOException if the detail page cannot be fetched
   */
  private void submitImage(String detailUrl, String title) throws IOException
  {
    if (detailUrl.length() == 0)
    {
      return;
    }
    Document detail = Jsoup.connect(detailUrl).get();
    Element content = detail.getElementById("endtext");
    if (content == null)
    {
      System.out.println("未找到图片地址,跳过url:" + detailUrl);
      return;
    }
    // The page source uses a relative path; "abs:" resolves it to absolute.
    String realUrl = content.select("img").attr("abs:src");
    if (realUrl.length() == 0)
    {
      System.out.println("未找到图片地址,跳过url:" + detailUrl);
      return;
    }
    System.out.println("获取图片url:" + realUrl);
    // Hand the image URL to the thread pool for download.
    this.threadPool.execute(new GifProcessor(title, realUrl));
  }
  
}

图片处理线程很简单,就是图片下载和保存:
package sys.gifspider;

import sys.gifspider.utils.FileProcessor;

/**
 * Pool task that downloads one image and saves it to disk by delegating to
 * {@link FileProcessor}. Failures are reported on stdout/stderr and never
 * propagate out of {@link #run()}.
 */
public class GifProcessor implements Runnable
{

  private final String imgName;
  private final String imgUrl;

  /**
   * @param name name to save the image under
   * @param url  URL the image is downloaded from
   */
  public GifProcessor(String name,String url)
  {
    imgName = name;
    imgUrl = url;
  }

  /**
   * Downloads and stores the image, logging the URL before the attempt and
   * again if the attempt fails.
   */
  @Override
  public void run()
  {
    final FileProcessor saver = new FileProcessor(imgName, imgUrl);
    try
    {
      String startMessage = "下载保存图片url:" + imgUrl;
      System.out.println(startMessage);
      saver.saveGif();
    }
    catch (Exception failure)
    {
      String failMessage = "下载保存图片失败,url:" + imgUrl;
      System.out.println(failMessage);
      failure.printStackTrace();
    }
  }

}


下载保存:

package sys.gifspider.utils;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads a single image over HTTP and writes it into the directory
 * configured by the {@code dir} property.
 */
public class FileProcessor
{
  /** Maximum time to wait for the TCP connection, in milliseconds. */
  private static final int CONNECT_TIMEOUT_MS = 10000;
  /** Maximum time to wait for data on an open connection, in milliseconds. */
  private static final int READ_TIMEOUT_MS = 30000;

  private String imgName;
  private String imgUrl;
  
  /**
   * @param name name to save the image under (without extension)
   * @param url  URL the image is downloaded from
   */
  public FileProcessor(String name,String url)
  {
    this.imgName = name;
    this.imgUrl = url;
  }
  
  /**
   * Returns the configured save directory, creating it (including any
   * missing parent directories) when it does not exist yet.
   *
   * @return the value of the {@code dir} property
   */
  private String makeDir()
  {
    String strdir = PropertyUtil.getProperties().getProperty("dir");
    File dir = new File(strdir);
    if (!dir.exists())
    {
      // mkdirs, not mkdir: the configured path may be several levels deep.
      dir.mkdirs();
    }
    return strdir;
  }

  /**
   * Builds the target file name: the image name plus the extension taken
   * from the last path segment of the URL. Query string and fragment are
   * ignored, and a URL without an extension yields no extension. When the
   * image name is null or blank, the URL's own file name is used instead.
   * Characters that are illegal in file names are replaced with '_'.
   *
   * @return a file name that is safe to create inside the save directory
   * @throws Exception if the image URL is malformed
   */
  private String targetFileName() throws Exception
  {
    String path = new URL(this.imgUrl).getPath();
    String segment = path.substring(path.lastIndexOf('/') + 1);
    int dot = segment.lastIndexOf('.');
    String ext = dot >= 0 ? segment.substring(dot) : "";
    String base = this.imgName == null ? "" : this.imgName.trim();
    if (base.length() == 0)
    {
      base = dot >= 0 ? segment.substring(0, dot) : segment;
    }
    // The name comes from page content, so strip path separators and other
    // characters that file systems reject.
    return (base + ext).replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
  }
  
  /**
   * Downloads the image and writes it to disk. Nothing is written when the
   * response body is empty.
   *
   * @throws Exception if the URL is malformed, the download fails or the
   *                   file cannot be written
   */
  public void saveGif() throws Exception
  {
    String dir = makeDir();
    // File(parent, child) inserts the separator when "dir" lacks a trailing one.
    File file = new File(dir, this.targetFileName());
    byte[] bit = this.download();
    if (bit.length > 0)
    {
      BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file));
      try
      {
        out.write(bit);
        out.flush();
      } finally
      {
        out.close();
      }
    }
  }

  /**
   * Reads the whole response body of the image URL into memory.
   *
   * @return the downloaded bytes (possibly empty)
   * @throws Exception if the URL is malformed, the connection fails or a
   *                   timeout expires
   */
  private byte[] download() throws Exception  
  {
    URL url = new URL(this.imgUrl);
    HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
    // Without timeouts a stalled server would block a pool thread forever.
    httpConn.setConnectTimeout(CONNECT_TIMEOUT_MS);
    httpConn.setReadTimeout(READ_TIMEOUT_MS);
    try
    {
      InputStream cin = httpConn.getInputStream();
      try
      {
        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        byte[] buffer = new byte[8192];
        int len;
        while ((len = cin.read(buffer)) != -1)
        {
          outStream.write(buffer, 0, len);
        }
        return outStream.toByteArray();
      } finally
      {
        // Close the stream even when reading fails part-way through.
        cin.close();
      }
    } finally
    {
      httpConn.disconnect();
    }
  }
}

程序入口如下:

package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Program entry point: queues the configured list pages and starts the
 * spider threads that consume them.
 */
public class Main
{
  public static void main(String[] args)
  {
    init();
    
  }

  /**
   * Reads {@code startPage}, {@code endPage}, {@code url} and
   * {@code spiderThread} from the properties, fills a queue with one URL per
   * page in the inclusive range and starts {@code spiderThread} spiders that
   * all share this queue.
   *
   * @throws IllegalArgumentException if {@code endPage} is smaller than
   *                                  {@code startPage}
   */
  public static void init()
  {
    Properties pro = PropertyUtil.getProperties();
    int startPage = Integer.parseInt(pro.getProperty("startPage"));
    int endPage = Integer.parseInt(pro.getProperty("endPage"));
    String url = pro.getProperty("url");
    int count = endPage - startPage +1;
    if (count <= 0)
    {
      throw new IllegalArgumentException(
          "endPage (" + endPage + ") must not be smaller than startPage (" + startPage + ")");
    }
    BlockingQueue<String> queue = new LinkedBlockingQueue<String>(count);
    // Iterate over the actual page numbers; counting from 1 would ignore
    // startPage whenever it is not 1.
    for (int page = startPage; page <= endPage; page++)
    {
      queue.add(String.format(url, page));
    }
    int spiderCount = Integer.parseInt(pro.getProperty("spiderThread"));
    for (int i = 0; i < spiderCount; i++)
    {
      GifSpider spider = new GifSpider(queue);
      Thread t = new Thread(spider);
      t.start();
    }
  }
  
}

配置文件:

# Number of GifSpider threads started by Main.
spiderThread=1

# Settings for the ThreadPoolExecutor that runs the image download tasks
# (read in GifSpider.init); keepAliveSeconds is in seconds and
# queueCapacity bounds the pool's task queue.
threadpool.corePoolSize=8
threadpool.maxPoolSize=10
threadpool.keepAliveSeconds=600
threadpool.queueCapacity=1000

# Page range of list pages to crawl, and the list-page URL pattern;
# the %d placeholder is filled with a page number via String.format.
startPage=1
endPage=20
url=http://www.haha365.com/gxtp/index_gif_%d.htm

# Directory the downloaded images are saved to.
dir=E:/spider/


用haha365的动态gif做了下测试,如果想爬别的网站,自己根据人家的html结构,改一下爬取规则即可。

程序中没做过多的容错处理,可能存在一定的bug。


源码下载


  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

失足成万古风流人物

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值