自己无聊做了一个简单爬虫:
依赖:
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
源码:
import org.jsoup.Connection;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import static org.jsoup.Jsoup.connect;
/**
* 爬虫爬取图片
*/
/**
 * Simple crawler that downloads images from a paginated picture-listing site.
 * Page 1 lives at /tupian/ and subsequent pages at /tupian/index_N.html; image
 * elements on each page are identified by the CSS class "lazy". Downloads land
 * in a fresh timestamped directory under F:\pic\ per page.
 */
public class Picture {
    public static void main(String[] args) throws IOException {
        int max = 10; // number of listing pages to crawl
        // Fix: use j <= max so all `max` pages are fetched (j < max skipped the last page).
        for (int j = 1; j <= max; j++) {
            // Page 1 has no index suffix; later pages follow the index_N.html pattern.
            String url = (j == 1)
                    ? "http://www.******.com/tupian/"
                    : "http://www.******.com/tupian/index_" + j + ".html";
            Document document = connect(url).get();
            // Image tags on this site carry the "lazy" CSS class.
            Elements images = document.getElementsByClass("lazy");
            System.out.println("ElementsCount:" + images.size());
            List<String> imageUrls = new ArrayList<String>();
            for (int i = 0; i < images.size(); i++) {
                String src = images.get(i).attr("src");
                System.out.println("src==" + src);
                // Fix: skip lazy-load placeholders whose src attribute is empty —
                // connecting to "" would fail the whole run.
                if (!src.isEmpty()) {
                    imageUrls.add(src);
                }
            }
            // One timestamped target directory per page (was new Date().getTime();
            // System.currentTimeMillis() is equivalent without the legacy Date API).
            saveImages(imageUrls, new File("F:\\pic\\" + System.currentTimeMillis()));
        }
    }

    /**
     * Downloads each image URL into {@code dir}, naming files 1.ext, 2.ext, ...
     *
     * @param urls absolute image URLs to fetch
     * @param dir  target directory; created if absent
     * @throws IOException if the directory cannot be created or a download/write fails
     */
    private static void saveImages(List<String> urls, File dir) throws IOException {
        // Fix: check mkdirs() — the original ignored the result and would crash
        // later with a confusing FileNotFoundException on the first write.
        if (!dir.exists() && !dir.mkdirs()) {
            throw new IOException("Could not create directory: " + dir);
        }
        int fileNumber = 1;
        for (String picUrl : urls) {
            int dot = picUrl.lastIndexOf('.');
            // Fix: guard against URLs with no extension — lastIndexOf returns -1
            // and substring(-1) would throw StringIndexOutOfBoundsException.
            String extension = (dot >= 0) ? picUrl.substring(dot) : ".jpg";
            Connection.Response response = connect(picUrl).ignoreContentType(true).execute();
            // Fix: try-with-resources closes the stream even when write() throws
            // (the original leaked the FileOutputStream on any I/O error).
            try (FileOutputStream out = new FileOutputStream(new File(dir, fileNumber + extension))) {
                out.write(response.bodyAsBytes());
            }
            fileNumber++;
        }
    }
}