楔子
用 jsoup 抓取图片,其实就是先解析页面、找出图片地址,然后逐张下载图片。
demo
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Demo crawler that downloads album pictures with jsoup.
 *
 * <p>Flow: {@link #main} walks list pages 1-9 below {@link #mainUrl}; every list page is
 * scanned for album entries; every album is paged through and each image found is saved
 * to {@code saveFile/page_N/albumTitle/fileName}.
 *
 * @author grq
 * @version created 2018-12-02 21:24:03
 */
public class DownPic {
    /** URL of the first list page; list page N (N &gt;= 2) is {@code mainUrl + N + ".html"}. */
    static String mainUrl = "https://www.meitulu.com/t/toutiaonvshen/";

    /**
     * Root directory that downloads are written to.
     *
     * <p>This field used to be an unused instance field pointing at "c://picc" while the
     * download code wrote to a hard-coded "c:/piccc/". It now holds the directory that was
     * really used and is the single place the target directory is defined.
     */
    static File saveFile = new File("c:/piccc");

    /** Suffix shared by list pages and album pages. */
    private static final String HTML_SUFFIX = ".html";

    /** Number of pictures per album page; the page count is derived from it. */
    private static final int PICS_PER_PAGE = 4;

    /** First run of decimal digits in a string. */
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    /** Characters that cannot appear in a Windows file or directory name. */
    private static final Pattern ILLEGAL_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

    /**
     * Downloads the albums listed on list pages 1 through 9.
     *
     * @param args ignored
     * @throws IOException declared for compatibility; page and album failures are reported
     *     and skipped inside {@link #parseMinPage}
     */
    public static void main(String[] args) throws IOException {
        for (int i = 1; i < 10; i++) {
            // The first list page has no page number in its URL.
            String pageUrl = (i == 1) ? mainUrl : mainUrl + i + HTML_SUFFIX;
            parseMinPage(pageUrl, "page_" + i);
        }
    }

    /**
     * Parses one list page and downloads every album it links to.
     *
     * <p>A failure to load the list page skips the page. A failure inside one album is
     * reported and the remaining albums of the page are still processed.
     *
     * @param url URL of the list page
     * @param pageNum directory name used for this list page, e.g. {@code page_1}
     */
    private static void parseMinPage(String url, String pageNum) {
        Document document;
        try {
            document = Jsoup.connect(url).get();
        } catch (IOException e) {
            System.out.println("主页连接超时");
            e.printStackTrace();
            return;
        }

        // Select the <li> entries directly instead of serializing and re-parsing the HTML.
        Elements albumItems = document.getElementsByClass("img").select("li");
        for (Element item : albumItems) {
            if (item.children().size() < 2) {
                continue; // not an album entry: needs a link child and a picture-count child
            }
            // Index 0 is the first child itself (read for its href); index 1 is its first
            // descendant (read for its alt text).
            Elements linkParts = item.child(0).getAllElements();
            if (linkParts.size() < 2) {
                continue;
            }
            String picURL = linkParts.get(0).attr("href");
            String attr = linkParts.get(1).attr("alt");

            // The second child's text is expected to look like "图片: 45 张"; take the whole
            // number rather than a fixed two-character slice so counts >= 100 are not cut.
            String picCount = firstNumber(item.child(1).text());
            if (picCount == null) {
                System.out.println("skip album without picture count: " + picURL);
                continue;
            }

            try {
                downDetail(picURL, pageNum + "/" + toDirName(attr), picCount);
                System.out.println("down pic is " + attr + " 地址是:" + picURL);
            } catch (IOException e) {
                // Keep going: one broken album must not abort the rest of the page.
                System.out.println("down pic failed " + attr + " 地址是:" + picURL);
                e.printStackTrace();
            }
        }
    }

    /**
     * Returns the first run of decimal digits in {@code text}.
     *
     * @param text text to search
     * @return the digits, or {@code null} when {@code text} contains none
     */
    private static String firstNumber(String text) {
        Matcher matcher = DIGITS.matcher(text);
        return matcher.find() ? matcher.group() : null;
    }

    /**
     * Turns an album title into a directory name: spaces are removed and characters that
     * are illegal in Windows file names are replaced by an underscore.
     *
     * @param albumTitle raw title taken from the page
     * @return a name usable as a single directory level
     */
    private static String toDirName(String albumTitle) {
        return ILLEGAL_NAME_CHARS.matcher(albumTitle.replaceAll(" ", "")).replaceAll("_");
    }

    /**
     * Downloads every picture of one album.
     *
     * <p>The album's first page is {@code picURL}; page N (N &gt;= 2) is the same URL with
     * {@code _N} inserted before {@code .html}. The number of pages is
     * {@code ceil(picCount / 4)}.
     *
     * @param picURL URL of the album's first page; must end with {@code .html}
     * @param title directory (relative to {@link #saveFile}) the pictures are stored in
     * @param picCount number of pictures in the album, as decimal text
     * @throws IOException if {@code picURL} or {@code picCount} is malformed, or a page or
     *     picture cannot be fetched or written
     */
    private static void downDetail(String picURL, String title, String picCount) throws IOException {
        if (!picURL.endsWith(HTML_SUFFIX)) {
            throw new IOException("unexpected album url: " + picURL);
        }
        String baseUrl = picURL.substring(0, picURL.length() - HTML_SUFFIX.length());

        int total;
        try {
            total = Integer.parseInt(picCount.trim());
        } catch (NumberFormatException e) {
            throw new IOException("invalid picture count: " + picCount, e);
        }
        // Computed once instead of re-parsing the count on every loop iteration.
        int pages = (total + PICS_PER_PAGE - 1) / PICS_PER_PAGE;

        for (int i = 1; i <= pages; i++) {
            String pageUrl = (i == 1) ? baseUrl + HTML_SUFFIX : baseUrl + "_" + i + HTML_SUFFIX;
            Elements contents = Jsoup.connect(pageUrl).get().getElementsByClass("content");
            if (contents.isEmpty()) {
                continue; // page without a "content" element: nothing to download
            }
            // Only the first "content" element is searched for pictures.
            for (Element img : contents.get(0).getElementsByTag("img")) {
                String src = img.attr("src");
                if (!src.isEmpty()) {
                    downPic(src, title);
                }
            }
        }
    }

    /**
     * Downloads one picture into {@code saveFile/title}, keeping the file name from the URL.
     *
     * @param picUrl absolute URL of the picture
     * @param title directory (relative to {@link #saveFile}) to store the picture in
     * @throws IOException if the picture cannot be fetched or written
     */
    private static void downPic(String picUrl, String title) throws IOException {
        // The request is sent with the site's own address as referrer; ignoreContentType
        // lets jsoup return the non-HTML response body.
        Response response = Jsoup.connect(picUrl)
                .referrer("https://www.meitulu.com/")
                .ignoreContentType(true)
                .execute();
        String name = FilenameUtils.getName(picUrl);
        BufferedInputStream bodyStream = response.bodyStream();
        // copyInputStreamToFile creates missing parent directories and closes bodyStream.
        FileUtils.copyInputStreamToFile(bodyStream, new File(new File(saveFile, title), name));
    }
}
pom
<!-- Apache Commons IO: the demo uses FileUtils.copyInputStreamToFile and FilenameUtils.getName. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
<!-- jsoup: used by the demo to fetch and parse pages and to download the image bytes.
     ${jsoup.version} is not defined in this snippet; it has to be supplied by a
     <properties> entry in the enclosing pom.
     NOTE(review): the demo calls Connection.Response.bodyStream(), which presumably needs
     jsoup 1.11.1 or later; confirm against the jsoup changelog. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>