Jsoup初接触-发一个Jsoup抓取图片的程序

最新推荐文章于 2023-05-23 10:39:34 发布

失足成万古风流人物

最新推荐文章于 2023-05-23 10:39:34 发布

阅读量2.6k

点赞数 1

分类专栏： javaWeb

本文链接：https://blog.csdn.net/blogtime/article/details/17138019

版权

javaWeb 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

主要有两个线程：图片url抓取线程、图片下载保存线程。

图片下载保存采用线程池处理，主要利用java的ThreadPoolExecutor实现。

url抓取线程：

package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Crawler task that resolves image URLs and hands the downloads to a thread pool.
 *
 * <p>Each list-page URL read from the shared queue is fetched with Jsoup. Every
 * link inside the first {@code div.pic_list2} element that wraps an
 * {@code <img>} is followed to its detail page, where the first image inside
 * the element with id {@code endtext} is the real picture. That URL, together
 * with the thumbnail's {@code alt} text, is submitted as a {@link GifProcessor}.
 *
 * <p>Each instance owns its own thread pool, which is shut down when
 * {@link #run()} returns. An instance is therefore meant to be driven by a
 * single thread and is not restartable once {@code run()} has returned.
 */
public class GifSpider implements Runnable
{

  /** Seconds to wait for a queued URL before re-checking the running flag. */
  private static final long POLL_TIMEOUT_SECONDS = 1L;

  /** Pause between two list pages, in milliseconds. */
  private static final long PAGE_INTERVAL_MILLIS = 1000L;

  volatile boolean isRunning = true;
  private ThreadPoolExecutor threadPool;
  BlockingQueue<String> queue;
  
  /**
   * Creates a spider that reads list-page URLs from the given queue.
   *
   * @param queue source of list-page URLs to crawl
   */
  public GifSpider(BlockingQueue<String> queue)
  {
    this.queue = queue;
    this.init();
  }
  
  /**
   * Builds the download thread pool from the {@code threadpool.*} properties.
   *
   * <p>The work queue is bounded. When both the queue and the pool are full,
   * {@code CallerRunsPolicy} makes the spider thread run the download itself,
   * which throttles crawling instead of throwing
   * {@code RejectedExecutionException} and killing the spider.
   */
  private void init()
  {
    Properties pro = PropertyUtil.getProperties();
    int corePoolSize = Integer.parseInt(pro.getProperty("threadpool.corePoolSize"));
    int maxPoolSize = Integer.parseInt(pro.getProperty("threadpool.maxPoolSize"));
    int keepAliveSeconds = Integer.parseInt(pro.getProperty("threadpool.keepAliveSeconds"));
    int queueCap = Integer.parseInt(pro.getProperty("threadpool.queueCapacity"));
    // Named differently from the URL queue field to avoid shadowing it.
    BlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<Runnable>(queueCap);
    this.threadPool = new ThreadPoolExecutor(
        corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS, 
        workQueue, new ThreadPoolExecutor.CallerRunsPolicy());
  }

  /**
   * @return {@code true} while the crawl loop is allowed to continue
   */
  public boolean isRunning()
  {
    return isRunning;
  }

  /**
   * Sets the running flag; passing {@code false} asks the crawl loop to stop.
   * The loop notices the change within about one poll timeout when it is idle.
   *
   * @param isRunning new value of the running flag
   */
  public void setRunning(boolean isRunning)
  {
    this.isRunning = isRunning;
  }

  /**
   * Crawl loop: processes queued list pages until the running flag is cleared
   * or the thread is interrupted, then shuts the download pool down so that
   * already submitted downloads finish and the pool threads can exit.
   */
  @Override
  public void run()
  {
    try
    {
      while (this.isRunning)
      {
        try
        {
          // A timed poll (instead of take) lets the loop observe isRunning
          // even when the queue stays empty.
          String url = this.queue.poll(POLL_TIMEOUT_SECONDS, TimeUnit.SECONDS);
          if (url == null)
          {
            continue;
          }
          this.crawlListPage(url);
          Thread.sleep(PAGE_INTERVAL_MILLIS);
        } catch (InterruptedException e)
        {
          // Keep the interrupt visible to the caller and stop crawling.
          Thread.currentThread().interrupt();
          this.isRunning = false;
        } catch (IOException e)
        {
          e.printStackTrace();
        }
      }
    } finally
    {
      this.threadPool.shutdown();
    }
  }

  /**
   * Fetches one list page and submits a download task for every image on it.
   *
   * @param url list-page URL
   * @throws IOException if the list page itself cannot be fetched
   */
  private void crawlListPage(String url) throws IOException
  {
    System.out.println("请求url：" + url);
    Document doc = Jsoup.connect(url).get();
    Element list = doc.select("div.pic_list2").first();
    if (list == null)
    {
      System.out.println("页面中未找到图片列表，url：" + url);
      return;
    }
    // All <a href> elements of the list.
    Elements links = list.select("a[href]");
    for (Element link : links)
    {
      // Each picture has an image link and a text link pointing to the same
      // detail page; only the image link is followed.
      Elements thumbs = link.select("img");
      if (thumbs.size() == 0)
      {
        continue;
      }
      String detailUrl = link.absUrl("href");
      if (detailUrl.length() == 0)
      {
        // The href could not be resolved to an absolute URL.
        continue;
      }
      try
      {
        String realUrl = this.resolveImageUrl(detailUrl);
        if (realUrl == null)
        {
          System.out.println("未找到图片地址，url：" + detailUrl);
          continue;
        }
        System.out.println("获取图片url：" + realUrl);
        // Hand the image URL over to the thread pool.
        GifProcessor pro = new GifProcessor(thumbs.attr("alt"), realUrl);
        this.threadPool.execute(pro);
      } catch (IOException e)
      {
        // One broken detail page must not drop the rest of the list page.
        System.out.println("详情页抓取失败，url：" + detailUrl);
        e.printStackTrace();
      }
    }
  }

  /**
   * Loads a detail page and returns the absolute URL of its picture.
   *
   * @param detailUrl absolute URL of the detail page
   * @return the absolute image URL, or {@code null} if the page has no
   *         {@code endtext} element or no image source inside it
   * @throws IOException if the detail page cannot be fetched
   */
  private String resolveImageUrl(String detailUrl) throws IOException
  {
    Document detail = Jsoup.connect(detailUrl).get();
    Element container = detail.getElementById("endtext");
    if (container == null)
    {
      return null;
    }
    // The page source uses relative paths, so ask for the absolute form.
    String realUrl = container.select("img").attr("abs:src");
    return realUrl.length() == 0 ? null : realUrl;
  }
  
}

图片处理线程很简单，就是图片下载和保存：

package sys.gifspider;

import sys.gifspider.utils.FileProcessor;

/**
 * Thread-pool task that downloads one image and saves it, delegating the
 * actual work to {@link FileProcessor}. Failures are reported on standard
 * output and never propagate out of {@link #run()}.
 */
public class GifProcessor implements Runnable
{

  /** Name passed on to the file processor. */
  private final String name;
  /** URL of the image to fetch. */
  private final String address;

  /**
   * @param name name passed on to the file processor
   * @param url  URL of the image to download
   */
  public GifProcessor(String name,String url)
  {
    this.name = name;
    this.address = url;
  }

  /**
   * Logs the URL, then downloads and saves the image; any exception is
   * logged together with its stack trace.
   */
  @Override
  public void run()
  {
    final FileProcessor saver = new FileProcessor(name, address);
    try
    {
      System.out.println("下载保存图片url：" + address);
      saver.saveGif();
    }
    catch (Exception failure)
    {
      System.out.println("下载保存图片失败，url：" + address);
      failure.printStackTrace();
    }
  }

}

下载保存：

package sys.gifspider.utils;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads one image over HTTP and stores it in the directory configured by
 * the {@code dir} property.
 */
public class FileProcessor
{
  /** Maximum time to establish the HTTP connection, in milliseconds. */
  private static final int CONNECT_TIMEOUT_MS = 10000;

  /** Maximum time to wait for data on the connection, in milliseconds. */
  private static final int READ_TIMEOUT_MS = 30000;

  /** Characters that must not appear in a file name (path separators, Windows-reserved, control). */
  private static final String ILLEGAL_NAME_CHARS = "[\\\\/:*?\"<>|\\p{Cntrl}]";

  private String imgName;
  private String imgUrl;
  
  /**
   * @param name base name for the saved file (without extension)
   * @param url  URL of the image to download
   */
  public FileProcessor(String name,String url)
  {
    this.imgName = name;
    this.imgUrl = url;
  }
  
  /**
   * Returns the configured save directory, creating it (including missing
   * parent directories) when it does not exist yet.
   *
   * @return the value of the {@code dir} property
   * @throws IllegalStateException if the {@code dir} property is not set
   */
  private String makeDir()
  {
    String strdir = PropertyUtil.getProperties().getProperty("dir");
    if (strdir == null)
    {
      throw new IllegalStateException("Missing required property: dir");
    }
    File dir = new File(strdir);
    if (!dir.exists())
    {
      // mkdirs also creates missing parents. A false result is not checked
      // here; if the directory is still missing, opening the output file fails.
      dir.mkdirs();
    }
    return strdir;
  }

  /**
   * Builds the target file name: the sanitized image name plus the extension
   * of the last path segment of the image URL.
   *
   * <p>The extension is read from the URL's path only, so query strings and
   * dots in the host name are ignored and a URL without any extension does
   * not fail. Characters that are illegal in file names are replaced by
   * {@code _}. If the image name is empty, the URL's own file name is used.
   *
   * @return a file name without any directory part
   * @throws Exception if the image URL is malformed
   */
  private String buildFileName() throws Exception
  {
    String path = new URL(this.imgUrl).getPath();
    String base = path.substring(path.lastIndexOf('/') + 1);
    int dot = base.lastIndexOf('.');
    String ext = dot >= 0 ? base.substring(dot).replaceAll(ILLEGAL_NAME_CHARS, "_") : "";

    String stem = this.imgName == null
        ? ""
        : this.imgName.replaceAll(ILLEGAL_NAME_CHARS, "_").trim();
    if (stem.length() == 0)
    {
      stem = (dot >= 0 ? base.substring(0, dot) : base).replaceAll(ILLEGAL_NAME_CHARS, "_");
    }
    if (stem.length() == 0)
    {
      stem = "unnamed";
    }
    return stem + ext;
  }
  
  /**
   * Downloads the image and writes it into the save directory. Nothing is
   * written when the download yields no bytes.
   *
   * @throws Exception if the URL is malformed, the download fails or the
   *                   file cannot be written
   */
  public void saveGif() throws Exception
  {
    String dir = makeDir();
    // File(parent, child) inserts the separator when dir lacks a trailing one.
    File target = new File(dir, this.buildFileName());
    byte[] bit = this.download();
    if (bit.length == 0)
    {
      return;
    }
    BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(target));
    try
    {
      out.write(bit);
      out.flush();
    } finally
    {
      out.close();
    }
  }

  /**
   * Fetches the image bytes over HTTP.
   *
   * <p>Timeouts keep a stalled server from blocking a pool thread forever,
   * and the stream and connection are released even when reading fails.
   *
   * @return the complete response body
   * @throws Exception if the URL is malformed or the request fails
   */
  private byte[] download() throws Exception  
  {
    URL url = new URL(this.imgUrl);
    HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
    httpConn.setConnectTimeout(CONNECT_TIMEOUT_MS);
    httpConn.setReadTimeout(READ_TIMEOUT_MS);
    try
    {
      httpConn.connect();
      InputStream cin = httpConn.getInputStream();
      try
      {
        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        byte[] buffer = new byte[8192];
        int len;
        while ((len = cin.read(buffer)) != -1)
        {
          outStream.write(buffer, 0, len);
        }
        return outStream.toByteArray();
      } finally
      {
        cin.close();
      }
    } finally
    {
      httpConn.disconnect();
    }
  }
}

程序入口如下：

package sys.gifspider;

import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import sys.gifspider.utils.PropertyUtil;

/**
 * Program entry point: queues the configured list pages and starts the
 * spider threads.
 */
public class Main
{
  /**
   * @param args command-line arguments (unused)
   */
  public static void main(String[] args)
  {
    init();
    
  }

  /**
   * Reads {@code startPage}, {@code endPage}, {@code url} and
   * {@code spiderThread} from the properties, queues one URL per page in the
   * inclusive range {@code startPage..endPage}, and starts the configured
   * number of {@link GifSpider} threads on that queue.
   *
   * @throws IllegalArgumentException if {@code endPage} is smaller than
   *                                  {@code startPage}
   * @throws IllegalStateException    if the {@code url} property is missing
   */
  public static void init()
  {
    Properties pro = PropertyUtil.getProperties();
    int startPage = Integer.parseInt(pro.getProperty("startPage"));
    int endPage = Integer.parseInt(pro.getProperty("endPage"));
    String url = pro.getProperty("url");
    if (url == null)
    {
      throw new IllegalStateException("Missing required property: url");
    }
    if (endPage < startPage)
    {
      throw new IllegalArgumentException(
          "endPage (" + endPage + ") must not be smaller than startPage (" + startPage + ")");
    }
    int count = endPage - startPage + 1;
    BlockingQueue<String> queue = new LinkedBlockingQueue<String>(count);
    // Iterate over the real page numbers; counting from 1 would ignore
    // startPage whenever it is not 1.
    for (int page = startPage; page <= endPage; page++)
    {
      queue.add(String.format(url, page));
    }
    int spiderCount = Integer.parseInt(pro.getProperty("spiderThread"));
    for (int i = 0; i < spiderCount; i++)
    {
      GifSpider spider = new GifSpider(queue);
      Thread t = new Thread(spider);
      t.start();
    }
  }
  
}

配置文件：

spiderThread=1

threadpool.corePoolSize=8
threadpool.maxPoolSize=10
threadpool.keepAliveSeconds=600
threadpool.queueCapacity=1000

startPage=1
endPage=20
url=http://www.haha365.com/gxtp/index_gif_%d.htm

dir=E:/spider/

用haha365的动态gif做了下测试，如果想爬别的网站，根据目标网站的html结构改一下爬取规则即可。

程序中没做过多的容错处理，可能存在一定的bug。

源码下载

失足成万古风流人物

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
打赏
0
评论
Jsoup初接触-发一个Jsoup抓取图片的程序

主要有两个线程：图片url抓取线程、图片下载保存线程。图片下载保存采用线程池处理，主要利用java的ThreadPoolExecutor实现。url抓取线程：package sys.gifspider;import java.io.IOException;import java.util.Properties;import java.util.concurrent.Bloc
复制链接

扫一扫