Java Crawler

package com.npk.test;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Created by Blank on 2017/1/11 17:32
 */


public class CrawlerPicture {

    // public final static int STARTPAGE = 65600;
    // public final static int ENDPAGE = 65620;
    public final static int STARTPAGE = 70300; // first page id to crawl
    public final static int ENDPAGE = 70500;   // last page id to crawl (exclusive)

    /**
     * Collect the src and alt attribute values of every picture.
     *
     * @return list of Picture entities for all pictures found
     * @throws IOException
     */
    public static List<Picture> getPictureUrl() throws IOException {
        int number = 1;
        List<Picture> pics = new ArrayList<Picture>(); // holds all the picture URLs found
        for (int i = STARTPAGE; i < ENDPAGE; i++) {
            System.out.println(i);

            String url = "http://meinv" + i + ".html";
            Document doc = Jsoup.connect(url).get(); // fetch the page document
            Elements divList = doc.body().select("div.detail-con");
            if (divList.size() == 0) {
                System.out.println("Set " + i + " has no pictures");
                continue;
            }
            Element liList = doc.body().select("li").first();
            if (liList != null) {
                // the first <li> holds a "current/total" counter, e.g. "1/10"
                // System.out.println("value: " + liList.text());
                String[] num = liList.text().split("/");
                // int num1 = liList.text().split("/").length;
                System.out.println("Total of " + num[1] + " pictures");
                // System.out.println(divList.size());
                for (int ci = 1; ci <= Integer.parseInt(num[1]); ci++) {
                    if (ci == 1) {
                        url = "http://m.qqba.com/meinv_" + i + ".html";
                    } else {
                        url = "http://m.qqba.com/meinv_" + i + "_" + ci + ".html";
                    }
                    System.out.println("Set " + i + ", picture " + ci);
                    doc = Jsoup.connect(url).get();
                    divList = doc.body().select("div.detail-con");
                    for (int j = 0; j < divList.size(); j++) {
                        Elements imgList = divList.get(j).select("img"); // all <img> tags in the div
                        // System.out.println(imgList.size());
                        for (int k = 0; k < imgList.size(); k++) {
                            Picture pic = new Picture();
                            pic.setId(number++);
                            pic.setSrc(imgList.get(k).attr("src"));
                            pic.setAlt(imgList.get(k).attr("alt"));
                            pics.add(pic);
                        }
                    }
                }
            }
        }

        return pics;
    }


    /**
     * Open an input stream for a picture.
     *
     * @param picUrl the picture's URL
     * @return an input stream over the picture data
     * @throws IOException
     */
    public static InputStream getPictureInputStream(String picUrl) throws IOException {
        URL url = new URL(picUrl);
        DataInputStream dis = new DataInputStream(url.openStream()); // open the picture's input stream
        return dis;
    }


    /**
     * Save a picture to the local disk.
     *
     * @param in  input stream of the picture data
     * @param pic the picture entity being saved
     * @throws IOException
     */
    public static void savePicture(InputStream in, Picture pic) throws IOException {
        // target path on disk; note this assumes the alt text is at least 8 characters long
        String newImgUrl = "E:/picture/" + pic.getAlt().substring(6, 8) + "--" + pic.getId() + ".jpg";
        File imgFile = new File(newImgUrl);
        imgFile.getParentFile().mkdirs(); // make sure E:/picture exists before writing
        FileOutputStream fos = new FileOutputStream(imgFile);
        byte[] buf = new byte[1024];
        int len = -1;
        while ((len = in.read(buf)) != -1) { // != -1: read(byte[]) may return 0 without meaning end of stream
            fos.write(buf, 0, len);
        }
        fos.close();
    }


    /**
     * Test driver.
     *
     * @param args
     */
    public static void main(String[] args) {
        try {
            List<Picture> pics = getPictureUrl();
            System.out.println("Downloading pictures...");
            for (int i = 0; i < pics.size(); i++) {
                Picture pic = pics.get(i);
                String picUrl = pic.getSrc();
                InputStream in = getPictureInputStream(picUrl);
                savePicture(in, pic);
                in.close();
            }
            System.out.println("Download finished!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}


It works well enough: study the site's structure, write the entity class, and you can crawl (a sketch of the entity class follows below). In the end I decided to hide the address, sorry.
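The listing references a Picture entity that the post never shows. Here is a minimal sketch of what it might look like, inferred from the setId/setSrc/setAlt and getId/getSrc/getAlt calls above; the field names are assumptions, not the author's original class. Beyond this entity, Jsoup (org.jsoup:jsoup) is the only external dependency the crawler needs on the classpath.

package com.npk.test;

/**
 * Minimal entity sketch inferred from the crawler code above;
 * field names are assumptions, not the author's original class.
 */
public class Picture {
    private int id;     // running number assigned by the crawler
    private String src; // value of the <img> tag's src attribute
    private String alt; // value of the <img> tag's alt attribute

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }

    public String getSrc() { return src; }
    public void setSrc(String src) { this.src = src; }

    public String getAlt() { return alt; }
    public void setAlt(String alt) { this.alt = alt; }
}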
