1. 必应今日美图
当使用 Bing 搜索时,每天都会出现一幅美图。
搜索找到bing今日美图 http://bing.plmeizi.com/ (这里收集了一年多的今日美图) 收集者: http://leil.plmeizi.com/
目前共47页
分页 URL 的格式为 http://bing.plmeizi.com/?page=N (N 为页码,从 1 到 47)
点进去就是我们要的图片和名称
2. 开始编码
使用简单的Jsoup进行爬虫,很简单,很好理解。
HtmlUtil
1 package util;
2
3 import java.io.IOException;
4
5 import org.jsoup.Jsoup;
6 import org.jsoup.nodes.Document;
7
8 public class HtmlUtil {
9 // 根据url从网络获取网页文本
10 public Document getHtmlTextByUrl(String url) {
11 Document doc = null;
12 try {
13 // doc = Jsoup.connect(url).timeout(5000000).get();
14 int i = (int) (Math.random() * 1000); // 做一个随机延时,防止网站屏蔽
15 while (i != 0) {
16 i--;
17 }
18 doc = Jsoup.connect(url).data("query", "Java").userAgent("Mozilla").cookie("auth", "token").timeout(300000)
19 .post();
20 } catch (IOException e) {
21 e.printStackTrace();
22 try {
23 doc = Jsoup.connect(url).timeout(5000000).get();
24 } catch (IOException e1) {
25 // TODO Auto-generated catch block
26 e1.printStackTrace();
27 }
28 }
29 return doc;
30 }
31 }
GetPhoto
这个编码主要需要先分析html属性,按照属性取到我们需要的元素,元素值。
我是先拿到每个图的page页面url
然后到详情页面拿到图片的 URL,并截取出图片的名称。
然后将图保存到本地。
1 package bing;
2
3 import java.io.DataInputStream;
4 import java.io.File;
5 import java.io.FileOutputStream;
6 import java.io.IOException;
7 import java.net.URL;
8
9 import org.jsoup.nodes.Document;
10 import org.jsoup.nodes.Element;
11 import org.jsoup.select.Elements;
12
13 import util.HtmlUtil;
14
15 /**
16 *
17 * @author loveincode
18 * @data Sep 29, 2017 1:15:00 PM
19 */
20 public class GetPhoto {
21
22 public static void go(int startpage, int endpage) throws IOException {
23
24 HtmlUtil htmlutil = new HtmlUtil();
25 // 获取图片的绝对路径
26 String url = "http://bing.plmeizi.com/?page=";
27 for (int i = startpage; i <= endpage; i++) {
28 String gourl = url + i + "";
29 Document dochtml = htmlutil.getHtmlTextByUrl(gourl);
30 Elements elements_a = dochtml.getElementsByClass("item");
31 for (int x = 0; x < elements_a.size(); x++) {
32 String pyotopage = elements_a.get(x).attr("href");
33 Document dochtml_photo = htmlutil.getHtmlTextByUrl(pyotopage);
34 Element elements_picurl = dochtml_photo.getElementById("picurl");
35 String picurl = elements_picurl.attr("href");
36 Element elements_searchlink = dochtml_photo.getElementById("searchlink");
37 String name = elements_searchlink.getElementsByTag("span").get(0).html();
38 name = name.split("\\(")[0];
39
40 if (picurl.contains("jpg")) {
41 // 下载图片
42 URL url_pic = new URL(picurl);
43 DataInputStream dataInputStream = new DataInputStream(url_pic.openStream());
44 String imageName = name + ".jpg";
45 FileOutputStream fileOutputStream = new FileOutputStream(new File("bing_pic/" + imageName));
46 byte[] buffer = new byte[1024];
47 int length;
48 while ((length = dataInputStream.read(buffer)) > 0) {
49 fileOutputStream.write(buffer, 0, length);
50 }
51 dataInputStream.close();
52 fileOutputStream.close();
53 }
54 }
55 }
56
57 }
58
59 public static void main(String[] args) throws IOException {
60 System.out.println("test");
61 go(1, 1);
62 }
63
64 }
Mythread
1 package bing;
2
3 import java.io.IOException;
4
5 public class Mythread extends Thread {
6
7 private int startpage;
8
9 private int endpage;
10
11 public Mythread(int startpage, int endpage) {
12 this.startpage = startpage;
13 this.endpage = endpage;
14 }
15
16 @SuppressWarnings("static-access")
17 @Override
18 public void run() {
19 GetPhoto getPhoto = new GetPhoto();
20 try {
21 getPhoto.go(startpage, endpage);
22 } catch (IOException e) {
23 // TODO Auto-generated catch block
24 e.printStackTrace();
25 }
26 }
27 }RUN
RUN
采用多线程,开启多个线程同时爬取图片
1 package bing;
2
3 import java.io.IOException;
4
5 /**
6 *
7 * @author loveincode
8 * @data Sep 29, 2017 1:55:57 PM
9 */
10 public class RUN {
11
12 public static void main(String[] args) throws IOException {
13
14 long startTime = System.currentTimeMillis(); // 获取开始时间
15
16 Mythread a1 = new Mythread(1, 5);
17 Mythread a2 = new Mythread(6, 10);
18 Mythread a3 = new Mythread(11, 15);
19 Mythread a4 = new Mythread(16, 20);
20 Mythread a5 = new Mythread(21, 25);
21 Mythread a6 = new Mythread(26, 30);
22 Mythread a7 = new Mythread(31, 35);
23 Mythread a8 = new Mythread(36, 40);
24 Mythread a9 = new Mythread(41, 45);
25 Mythread a10 = new Mythread(46, 47);
26
27 a1.start();
28 a2.start();
29 a3.start();
30 a4.start();
31 a5.start();
32 a6.start();
33 a7.start();
34 a8.start();
35 a9.start();
36 a10.start();
37
38 while (true) {
39 if (a1.isAlive() == false && a2.isAlive() == false && a3.isAlive() == false && a4.isAlive() == false
40 && a5.isAlive() == false && a6.isAlive() == false && a7.isAlive() == false && a8.isAlive() == false
41 && a9.isAlive() == false && a10.isAlive() == false) {
42 long endTime = System.currentTimeMillis(); // 获取结束时间
43 System.out.println("程序运行时间: " + (endTime - startTime) / 1000.0 + "s");
44 break;
45 }
46 }
47 }
48
49 }
执行 RUN
耗时76.962s 完成图片下载到本地。
成功
效果:
很高清吧