使用 jsoup 抓取网页中的 img 标签
package com.betawoo.portal.util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small jsoup-based helper that downloads a web page and prints the address of
 * every {@code <img>} element found in it.
 *
 * <p>Created by hgg on 2017/9/2.
 */
public class spiderUtils {

    /** Browser-like User-Agent header value sent with every page request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36";

    /**
     * Downloads the page at {@code url} and prints the address of each image in it
     * to standard output (see {@link #getImagesWithHtml(Document)}).
     *
     * <p>This method is best-effort: any failure while fetching or processing the
     * page is reported on standard error and is not propagated to the caller.
     *
     * @param url address of the page to fetch
     */
    public static void getHtmlWithUrl(String url) {
        try {
            Document doc = Jsoup.connect(url).userAgent(USER_AGENT).get();
            getImagesWithHtml(doc);
        } catch (Exception e) {
            // Deliberately broad: this is the top-level boundary of the utility and
            // callers expect it never to throw. Name the failing URL so the stack
            // trace that follows can be tied back to a specific request.
            System.err.println("Failed to fetch images from: " + url);
            e.printStackTrace();
        }
    }

    /**
     * Prints the {@code src} of every {@code <img>} element in {@code doc}, one per
     * line, to standard output.
     *
     * <p>Each address is resolved against the element's base URI so that relative
     * and protocol-relative values are printed as absolute URLs. When a value cannot
     * be resolved, the raw attribute value is printed instead. Images without a
     * {@code src} value are skipped.
     *
     * @param doc parsed document to scan for images
     */
    public static void getImagesWithHtml(Document doc) {
        Elements images = doc.getElementsByTag("img");
        for (Element image : images) {
            String src = image.absUrl("src");
            if (src.isEmpty()) {
                // absUrl yields "" when the value cannot be made absolute; keep the
                // raw value in that case rather than dropping the image.
                src = image.attr("src");
            }
            if (src.isEmpty()) {
                // No src at all: nothing useful to print.
                continue;
            }
            System.out.println(src);
        }
    }

    /**
     * Demo entry point: prints the image addresses of a sample page.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Page to crawl.
        spiderUtils.getHtmlWithUrl("https://zhidao.baidu.com/question/453277368.html");
    }
}
引入 jsoup 依赖(添加到 Maven 的 pom.xml)
<!-- jsoup: provides the org.jsoup.* classes (Jsoup, Document, Element, Elements) used by spiderUtils -->
<!-- NOTE(review): 1.10.2 is an old release; consider upgrading to a current version after confirming compatibility -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>