楔子
学习jsoup ,抓取图片作为练习
更多 jsoup 资料请参考官网 https://jsoup.org/ 。英语不好的(说的就是我)可以看中文文档 http://www.open-open.com/jsoup/
注意
网页有防盗链
如果没有防盗链,直接调用 FileUtils.copyURLToFile(url, new File("c://meizitu.jpg")); 就可以下载图片。
防盗链
有些防盗链是通过 Referer 请求头实现的,针对这种情况可以做如下简单处理:
// Example image URL; the article states that the site uses hotlink protection.
String src = "http://img.mmjpg.com/2018/1305/1in1.jpg";
URL url = new URL(src);
URLConnection con = url.openConnection();
// Connect timeout of 5 seconds (the value is in milliseconds).
con.setConnectTimeout(5 * 1000);
// Build the Referer from the image URL's own scheme and host.
String referer = url.getProtocol() + "://" + url.getHost();
con.setRequestProperty("Referer", referer);
// Print the Referer value that will be sent.
System.out.println(referer);
FileUtils.copyInputStreamToFile(con.getInputStream(), new File("c://2.jpg"));
或者采用下面的方式:
// Alternative using jsoup. NOTE: `url` is not declared in this fragment; it is
// presumably the URL object created in the preceding snippet.
String referer = url.getProtocol() + "://" + url.getHost();
// Configure the request: ignoreContentType(true) is set because the target is an
// image rather than HTML, and referrer(...) supplies the Referer header.
Connection ref = Jsoup.connect(url.toString()).ignoreContentType(true).referrer(referer);
// Execute the request and obtain the response body as a stream.
Response execute = ref.execute();
BufferedInputStream bodyStream = execute.bodyStream();
FileUtils.copyInputStreamToFile(bodyStream, new File("c://1.jpg"));
这部分可以参考 https://stackoverflow.com/questions/13558448/jsoup-http-error-fetching-url
https://blog.csdn.net/lf_breeze/article/details/51862107
https://blog.csdn.net/u013123635/article/details/78447440
抓取图片
下面的代码针对无需登录即可访问的页面。
抓取图片主要分两步:一是找到图片 URL,二是下载图片。
找图片 URL 的过程稍微繁琐一点,不过多数网页的图片都遵循固定的模板。
1:找URL
![]()
如图,图片列表在 id 为 pins 的 ul 里面。依次找到每个 li,取出每组图片的 url,然后再根据此 url 继续查找具体的图片地址。
![]()
根据上面的 地址,就可看到 图片的具体url.然后下载就行。
图片首页信息
package cn.zuzi.mzitu;
public class PageBean {

    /**
     * Name used to locate the publication time of a picture entry.
     */
    public static String todayClassString = "time";

    /**
     * Id of the element that holds the picture list on each page.
     */
    public static String pageIdString = "pins";

    /**
     * Home page of the site.
     */
    public static String baseUrl = "http://www.mzitu.com/";
}
构造图片实体类 此处省略 get set方法
package cn.zuzi.mzitu;
/**
 * Describes one picture set. Getter and setter methods are omitted from this
 * listing.
 */
public class PicBean {

    /**
     * Class name marking the pagination area of a picture page.
     */
    public static String picNumClass = "pagenavi";

    /**
     * Class name used to locate the main image (name suggests "image position").
     */
    static String picImgPostion = "main-image";

    /**
     * Title of the picture set.
     */
    private String title;

    /**
     * Number of pictures in the set.
     */
    private Integer picNum;

    /**
     * Base URL of the picture set.
     */
    private String picBaseUrl;

    /**
     * Publication date of the picture set (presumably in the form shown on the site).
     */
    private String picPubDate;
}
下载需要的方法
package cn.zuzi.mzitu.utils;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Response;
import org.jsoup.select.Elements;
/**
 * Static helpers for reading pagination info and downloading image files.
 */
public class PicUtils {
    /** Character encoding name made available to callers. */
    public static final String encoding = "UTF-8";
    /** Root directory under which downloaded files are written. */
    public static final String SAVE_FILE = "d:/picc";

    /**
     * Extracts the page count from an HTML fragment of the pagination area.
     *
     * <p>The text of the second-to-last {@code <a>} element is parsed as the
     * count (presumably the last anchor is a "next page" link; verify against
     * the site's markup).
     *
     * @param content HTML of the pagination element
     * @return the number parsed from the second-to-last anchor
     * @throws IndexOutOfBoundsException if the fragment has fewer than two anchors
     * @throws NumberFormatException if that anchor's text is not an integer
     */
    public static int getPicNum(String content) {
        Elements anchors = Jsoup.parse(content).getElementsByTag("a");
        String text = anchors.get(anchors.size() - 2).text();
        return Integer.parseInt(text);
    }

    /**
     * Downloads {@code urlStr} into {@code SAVE_FILE/fileName}, sending the
     * URL's own scheme and host as the Referer header.
     *
     * <p>I/O failures are reported on standard error and otherwise ignored, so
     * one failed download does not stop the caller.
     *
     * @param urlStr   absolute URL of the file to fetch
     * @param fileName path of the target file, relative to {@link #SAVE_FILE}
     */
    public static void downPic(String urlStr, String fileName) {
        try {
            URL url = new URL(urlStr);
            String referer = url.getProtocol() + "://" + url.getHost();
            Connection request = Jsoup.connect(url.toString())
                    .ignoreContentType(true)
                    .referrer(referer)
                    // jsoup limits the response body size by default; 0 removes the
                    // limit so that large images are not cut off.
                    .maxBodySize(0);
            Response response = request.execute();
            BufferedInputStream bodyStream = response.bodyStream();
            // copyInputStreamToFile creates missing parent directories and closes the stream.
            FileUtils.copyInputStreamToFile(bodyStream, new File(SAVE_FILE, fileName));
        } catch (IOException e) {
            // Name the failing download so the stack trace can be matched to a URL.
            System.err.println("Failed to download " + urlStr + " to " + fileName);
            e.printStackTrace();
        }
    }

    /**
     * Returns the file extension of a URL or path, including the leading dot.
     *
     * <p>Only the last path segment is inspected, and any query string or
     * fragment is ignored, so a dot in the host name or in a parameter is never
     * mistaken for an extension.
     *
     * @param fileStr URL or file path; may be {@code null}
     * @return the extension such as {@code ".jpg"}, or an empty string when
     *         there is none
     */
    public static String getFieExtName(String fileStr) {
        if (fileStr == null) {
            return "";
        }
        // Cut off "?query" and "#fragment" before looking for the extension.
        int end = fileStr.length();
        int query = fileStr.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = fileStr.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = fileStr.substring(0, end);
        int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        int lastDot = path.lastIndexOf('.');
        // No dot at all, or the last dot belongs to an earlier segment: no extension.
        if (lastDot <= lastSeparator) {
            return "";
        }
        return path.substring(lastDot);
    }
}
下载具体 图片
package cn.zuzi.mzitu.utils;
/**
 * Thread that downloads a single image and stores it under
 * {@code <picPubDate>/<title>/<serial><extension>}.
 */
public class PicDownThread extends Thread {
    private String title;
    private String picPubDate;
    private String picdetailUrl;
    private int serial;

    /**
     * @param title        title of the picture set, used as a directory name
     * @param picPubDate   publication date, used as the parent directory name
     * @param picdetailUrl URL of the image to download
     * @param serial       number used as the file's base name
     */
    public PicDownThread(String title, String picPubDate, String picdetailUrl, int serial) {
        this.title = title;
        this.picPubDate = picPubDate;
        this.picdetailUrl = picdetailUrl;
        this.serial = serial;
    }

    /** Builds the relative target path and delegates the download to PicUtils. */
    @Override
    public void run() {
        String extension = PicUtils.getFieExtName(picdetailUrl);
        String relativePath = picPubDate + "/" + title + "/" + serial + extension;
        PicUtils.downPic(picdetailUrl, relativePath);
    }
}
通过首页进入各个页面 查找需要信息
package cn.zuzi.mzitu;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import cn.zuzi.mzitu.utils.PicDownThread;
import cn.zuzi.mzitu.utils.PicUtils;
/**
 * Thread that walks through every page of one picture set and starts a
 * {@link PicDownThread} for the image found on each page.
 */
public class DownPicThread extends Thread {
    /** The picture set to download; its picture count is filled in by {@link #run()}. */
    public PicBean picbean;

    /**
     * @param picBean2 the picture set to download
     */
    public DownPicThread(PicBean picBean2) {
        this.picbean = picBean2;
    }

    /**
     * Loads the set's landing page, reads the page count from its pagination
     * area, then starts one download per page. Image 1 comes from the landing
     * page itself; pages 2..picNum are fetched from {@code <baseUrl>/<page>}.
     */
    @Override
    public void run() {
        String picBaseUrl = picbean.getPicBaseUrl();
        try {
            Document document = Jsoup.connect(picBaseUrl).get();
            String firstPicSrc = getPicDetailUrl(document);
            String pageNavHtml = document.getElementsByClass(PicBean.picNumClass).get(0).toString();
            picbean.setPicNum(PicUtils.getPicNum(pageNavHtml));
            File targetDir = new File(PicUtils.SAVE_FILE, picbean.getPicPubDate() + "/" + picbean.getTitle() + "/");
            if (!targetDir.exists()) {
                targetDir.mkdirs();
            }
            // Download the first image.
            new PicDownThread(picbean.getTitle(), picbean.getPicPubDate(), firstPicSrc, 1).start();
        } catch (IOException e) {
            // Without the landing page there is no page count, so nothing else can be fetched.
            System.err.println("Failed to load picture set " + picBaseUrl);
            e.printStackTrace();
            return;
        }
        // Download the remaining images. The landing page already supplied image 1,
        // so start at page 2; starting at 1 would request picNum + 1 images for a
        // set of picNum pages and shift every serial number by one.
        for (int page = 2; page <= picbean.getPicNum(); page++) {
            String pageUrl = picBaseUrl + "/" + page;
            try {
                Document pageDocument = Jsoup.connect(pageUrl).get();
                String picDetailUrl = getPicDetailUrl(pageDocument);
                new PicDownThread(picbean.getTitle(), picbean.getPicPubDate(), picDetailUrl, page).start();
            } catch (IOException e) {
                // A single unreachable page should not abort the rest of the set.
                System.err.println("Failed to load page " + pageUrl);
                e.printStackTrace();
            }
        }
    }

    /**
     * Extracts the image URL from a picture page.
     *
     * @param document parsed picture page
     * @return the {@code src} attribute of the first {@code img} inside the
     *         first element carrying the main-image class
     * @throws IndexOutOfBoundsException if the page has no such element or image
     */
    private String getPicDetailUrl(Document document) {
        return document.getElementsByClass(PicBean.picImgPostion).get(0).getElementsByTag("img").get(0).attr("src");
    }
}
下载采集开始入口
package cn.zuzi.mzitu;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.joda.time.DateTime;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Entry point of the crawler: collects the picture sets listed on the home
 * page and starts one download thread per set.
 */
public class CollctPic {
    /**
     * Collects the picture sets from the home page and downloads them.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The date argument currently has no effect; see getPicList.
        List<PicBean> picBeanList = getPicList("2018-03-31");
        downPic(picBeanList);
    }

    /**
     * Starts a {@link DownPicThread} for every picture set.
     *
     * @param picBeanList picture sets to download
     */
    private static void downPic(List<PicBean> picBeanList) {
        for (PicBean picBean : picBeanList) {
            new DownPicThread(picBean).start();
        }
    }

    /**
     * Reads the home page and returns one {@link PicBean} per list entry.
     *
     * <p>Filtering by date is intentionally not applied, so every entry on the
     * home page is returned regardless of {@code dateStr}.
     *
     * @param dateStr publication date of interest; currently not used for filtering
     * @return the collected picture sets; empty when the page cannot be loaded
     *         or contains no picture list
     */
    private static List<PicBean> getPicList(String dateStr) {
        List<PicBean> list = new ArrayList<PicBean>();
        try {
            Document document = Jsoup.connect(PageBean.baseUrl).get();
            Element picUl = document.getElementById(PageBean.pageIdString);
            if (picUl == null) {
                // getElementById returns null when the id is absent (e.g. the layout changed).
                System.err.println("No element with id " + PageBean.pageIdString + " at " + PageBean.baseUrl);
                return list;
            }
            for (Element element : picUl.getElementsByTag("li")) {
                Elements spans = element.getElementsByTag("span");
                // Skip entries that lack the title span or the date span.
                if (spans.size() < 2) {
                    continue;
                }
                Elements links = spans.get(0).getElementsByTag("a");
                if (links.isEmpty()) {
                    continue;
                }
                PicBean picBean = new PicBean();
                picBean.setPicPubDate(spans.get(1).text());
                picBean.setTitle(spans.get(0).text());
                picBean.setPicBaseUrl(links.get(0).attr("href"));
                list.add(picBean);
            }
        } catch (IOException e) {
            System.err.println("Failed to load " + PageBean.baseUrl);
            e.printStackTrace();
        }
        return list;
    }
}