直接上代码:
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Minimal breadth-first web crawler built on jsoup.
 *
 * <p>Starting from a single seed page, it fetches every queued page, prints each
 * {@code <img src>} value that starts with {@code "http"}, and appends every
 * not-yet-seen {@code <a href>} value that starts with {@code "http"} to the
 * work queue. Relative links are ignored.
 */
public class Test {

    /** First page fetched by the crawl. */
    private static final String SEED_URL = "http://news.baidu.com/";

    /** Only attribute values with this prefix are treated as followable/printable links. */
    private static final String HTTP_PREFIX = "http";

    /**
     * Runs the crawl until every discovered page has been visited once.
     *
     * <p>A page that cannot be fetched or parsed is reported on standard error and
     * skipped; the crawl then continues with the next queued page. The crawl has no
     * page limit, so on a well-connected site it may run for a very long time.
     *
     * @param args command-line arguments (unused)
     * @throws IOException kept in the signature for backward compatibility; per-page
     *     failures are handled inside the loop and are not propagated
     */
    public static void main(String[] args) throws IOException {
        // Pages in discovery order; the list grows while it is being walked by index.
        List<String> pageQueue = new ArrayList<String>();
        // Mirror of pageQueue used only for constant-time "already queued?" checks.
        Set<String> knownPages = new HashSet<String>();
        pageQueue.add(SEED_URL);
        knownPages.add(SEED_URL);

        // Bounding by size() ends the crawl cleanly once no unvisited page remains.
        for (int i = 0; i < pageQueue.size(); i++) {
            String url = pageQueue.get(i);
            try {
                Document doc = Jsoup.connect(url).get();
                System.out.println("==============当前url" + url + "下有图片链接===============");
                printImageLinks(doc);
                enqueueNewLinks(doc, pageQueue, knownPages);
            } catch (Exception e) {
                // Best-effort crawl: one bad page must not stop the run. The loop
                // header still advances the index, so a failing page is never retried.
                System.err.println("Skipping " + url + ": " + e);
                continue;
            }
            System.out.println();
            System.out.println("httpUrl数目" + pageQueue.size());
        }
    }

    /** Prints the {@code src} of every {@code <img>} in {@code doc} that starts with "http". */
    private static void printImageLinks(Document doc) {
        Elements images = doc.getElementsByTag("img");
        for (Element image : images) {
            String src = image.attr("src");
            if (src.startsWith(HTTP_PREFIX)) {
                System.out.println(src);
            }
        }
    }

    /**
     * Appends to {@code pageQueue} every {@code <a href>} in {@code doc} that starts
     * with "http" and has not been queued before.
     */
    private static void enqueueNewLinks(Document doc, List<String> pageQueue, Set<String> knownPages) {
        Elements anchors = doc.getElementsByTag("a");
        for (Element anchor : anchors) {
            String href = anchor.attr("href");
            // Set.add returns false for a duplicate, so each URL is queued at most once.
            if (href.startsWith(HTTP_PREFIX) && knownPages.add(href)) {
                pageQueue.add(href);
            }
        }
    }
}
第三方包(jsoup)下载地址:https://jsoup.org/download