Crawler：使用Jsoup下载pixiv的图片

最新推荐文章于 2023-05-21 15:42:55 发布

一矢光明

最新推荐文章于 2023-05-21 15:42:55 发布

阅读量402

点赞数 1

分类专栏： JAVA

本文链接：https://blog.csdn.net/u010129379/article/details/53409312

版权

JAVA 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

/**
 * Small Jsoup-based crawler that downloads illustration images.
 *
 * <p>{@link #main(String[])} first fetches a pixivision article and stores every
 * matched image under {@link #DOWNLOAD_DIR}, then fetches the pixiv Google+ page
 * and stores its images through {@code StoreFromNet}.
 *
 * <p>Jsoup can also parse an HTML string ({@code Jsoup.parse(html)}) or a local
 * file ({@code Jsoup.parse(file, charset, baseUri)}); this class only uses the
 * "connect to a URL" form.
 */
public class Jsoupee {

    /** Directory that images from the pixivision article are written to. */
    private static final String DOWNLOAD_DIR = "D:/downloadimg/";

    // API reference: http://www.open-open.com/jsoup/
    /**
     * Downloads the images of one pixivision article, then the images of the
     * pixiv Google+ page.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched, the download directory cannot
     *                   be created, or an image cannot be downloaded
     */
    public static void main(String[] args) throws Exception {
        Document doc = Jsoup.connect("http://www.pixivision.net/zh/a/1846").get();
        // The class name must not carry surrounding whitespace, otherwise it can
        // never equal one of the element's whitespace-separated class tokens.
        Elements imgs = doc.getElementsByClass("am__work__illust");

        // FileOutputStream does not create missing parent directories.
        File targetDir = new File(DOWNLOAD_DIR);
        if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
            throw new IOException("Cannot create download directory: " + targetDir);
        }

        StoreFromNet2 downloader = new StoreFromNet2();
        int m = 0;
        for (Element img : imgs) {
            String url = img.absUrl("src");
            System.out.println(url);
            if (url.isEmpty()) {
                // absUrl yields "" when there is no usable src; skip instead of
                // letting new URL("") abort the whole run.
                continue;
            }
            String fileName = "google" + m + ".jpg";
            m++;
            downloader.download(url, new File(targetDir, fileName));
        }
        System.out.println(imgs.size());
        System.out.println("抓去结束");

        // Fetch the images shown on pixiv's Google+ page.
        Jsoupee ee = new Jsoupee();
        ee.downloadFromGooglePlus();
    }

    /**
     * Fetches the pixiv Google+ page and stores every image found on it.
     *
     * <p>Each image is read into memory with
     * {@code StoreFromNet.getImageFromNetByUrl} and, when bytes were returned,
     * written with {@code StoreFromNet.writeImageToDisk} under the name
     * {@code google<index>.jpg}.
     *
     * @throws IOException if the page cannot be fetched
     */
    private void downloadFromGooglePlus() throws IOException {
        Document doc = Jsoup.connect("https://plus.google.com/+pixiv").get();
        Elements imgs = doc.getElementsByClass("JZUAbb");
        int m = 0;
        for (Element img : imgs) {
            String url = img.absUrl("src");
            System.out.println(url);
            if (url.isEmpty()) {
                // No usable src on this element: nothing to request.
                System.out.println("没有从该链接获得内容");
                continue;
            }
            StoreFromNet png = new StoreFromNet();
            byte[] btImg = png.getImageFromNetByUrl(url);
            if (btImg != null && btImg.length > 0) {
                System.out.println("读取到：" + btImg.length + "字节");
                String fileName = "google" + m + ".jpg";
                m++;
                png.writeImageToDisk(btImg, fileName);
            } else {
                System.out.println("没有从该链接获得内容");
            }
        }

        System.out.println(imgs.size());
        System.out.println("抓去结束");
    }

}

/**
 * Downloads the content behind a URL into a local file.
 */
public class StoreFromNet2 {

    /**
     * Downloads {@code imgurl} and writes the received bytes to {@code f}.
     *
     * <p>The request carries a {@code Referer} header equal to the image URL and,
     * when the URL has a host, a {@code Host} header built from the parsed URL.
     * A response whose content encoding is {@code gzip} is decompressed while it
     * is copied. Both streams are closed before the method returns, including
     * when the copy fails.
     *
     * @param imgurl
     *            download link
     * @param f
     *            file the downloaded image is written to
     * @return {@code f} once the download has completed
     * @throws Exception
     *             if the URL is malformed, or the connection or the file write
     *             fails
     */
    public File download(String imgurl, File f) throws Exception {
        URL url = new URL(imgurl);
        URLConnection con = url.openConnection();

        // Take the host from the parsed URL instead of fixed string offsets,
        // which were only right for "http://" URLs with a long enough host.
        String host = url.getHost();
        if (host != null && !host.isEmpty()) {
            int port = url.getPort();
            con.setRequestProperty("Host", port == -1 ? host : host + ":" + port);
        }
        con.setRequestProperty("Referer", imgurl);

        // The input is opened before the output so that a failed connection
        // does not create or truncate the target file.
        try (InputStream raw = con.getInputStream();
                InputStream in = decode(raw, con.getContentEncoding());
                OutputStream out = new FileOutputStream(f)) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            out.flush();
        }
        return f;
    }

    /** Wraps {@code raw} in a gzip decoder when the content encoding says so. */
    private static InputStream decode(InputStream raw, String contentEncoding) throws Exception {
        if (contentEncoding != null && contentEncoding.equalsIgnoreCase("gzip")) {
            return new GZIPInputStream(raw);
        }
        return raw;
    }
}

一矢光明

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Crawler：使用Jsoup下载pixiv的图片

public class Jsoupee { // API参考http://www.open-open.com/jsoup/ public static void main(String[] args) throws Exception { // Document doc = // Jsoup.connect("http://www.
复制链接

扫一扫

专栏目录