Java Crawler

package com.npk.test;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Created by Blank on 2017/1/11 17:32
 */


public class CrawlerPicture {

    // public final static int STARTPAGE = 65600;
    // public final static int ENDPAGE = 65620;
    public final static int STARTPAGE = 70300; // first page id to crawl
    public final static int ENDPAGE = 70500;   // last page id to crawl (exclusive)

    /**
     * Collect the src and alt attribute values of every picture.
     *
     * @return list of Picture entities for all pictures found
     * @throws IOException
     */
    public static List<Picture> getPictureUrl() throws IOException {
        int number = 1;
        List<Picture> pics = new ArrayList<Picture>(); // holds all the picture URLs found
        for (int i = STARTPAGE; i < ENDPAGE; i++) {
            System.out.println(i);

            String url = "http://meinv" + i + ".html";
            Document doc = Jsoup.connect(url).get(); // fetch the page document
            Elements divList = doc.body().select("div.detail-con");
            if (divList.size() == 0) {
                System.out.println("Set " + i + " has no pictures");
                continue;
            }
            Element liList = doc.body().select("li").first();
            if (liList != null) {
                // the first <li> holds a "current/total" counter, e.g. "1/10"
                // System.out.println("value: " + liList.text());
                String[] num = liList.text().split("/");
                // int num1 = liList.text().split("/").length;
                System.out.println("Total of " + num[1] + " pictures");
                // System.out.println(divList.size());
                for (int ci = 1; ci <= Integer.parseInt(num[1]); ci++) {
                    if (ci == 1) {
                        url = "http://m.qqba.com/meinv_" + i + ".html";
                    } else {
                        url = "http://m.qqba.com/meinv_" + i + "_" + ci + ".html";
                    }
                    System.out.println("Set " + i + ", picture " + ci);
                    doc = Jsoup.connect(url).get();
                    divList = doc.body().select("div.detail-con");
                    for (int j = 0; j < divList.size(); j++) {
                        Elements imgList = divList.get(j).select("img"); // all <img> tags in the div
                        // System.out.println(imgList.size());
                        for (int k = 0; k < imgList.size(); k++) {
                            Picture pic = new Picture();
                            pic.setId(number++);
                            pic.setSrc(imgList.get(k).attr("src"));
                            pic.setAlt(imgList.get(k).attr("alt"));
                            pics.add(pic);
                        }
                    }
                }
            }
        }

        return pics;
    }


    /**
     * Open an input stream for a picture.
     *
     * @param picUrl the picture's URL
     * @return an input stream over the picture data
     * @throws IOException
     */
    public static InputStream getPictureInputStream(String picUrl) throws IOException {
        URL url = new URL(picUrl);
        DataInputStream dis = new DataInputStream(url.openStream()); // open the picture's input stream
        return dis;
    }


    /**
     * Save a picture to the local disk.
     *
     * @param in  input stream of the picture data
     * @param pic the picture entity being saved
     * @throws IOException
     */
    public static void savePicture(InputStream in, Picture pic) throws IOException {
        // target path on disk; note this assumes the alt text is at least 8 characters long
        String newImgUrl = "E:/picture/" + pic.getAlt().substring(6, 8) + "--" + pic.getId() + ".jpg";
        File imgFile = new File(newImgUrl);
        imgFile.getParentFile().mkdirs(); // make sure E:/picture exists before writing
        FileOutputStream fos = new FileOutputStream(imgFile);
        byte[] buf = new byte[1024];
        int len = -1;
        while ((len = in.read(buf)) != -1) { // != -1: read(byte[]) may return 0 without meaning end of stream
            fos.write(buf, 0, len);
        }
        fos.close();
    }


    /**
     * Test driver.
     *
     * @param args
     */
    public static void main(String[] args) {
        try {
            List<Picture> pics = getPictureUrl();
            System.out.println("Downloading pictures...");
            for (int i = 0; i < pics.size(); i++) {
                Picture pic = pics.get(i);
                String picUrl = pic.getSrc();
                InputStream in = getPictureInputStream(picUrl);
                savePicture(in, pic);
                in.close();
            }
            System.out.println("Download finished!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}


It works well enough: study the site's structure, write the entity class, and you can crawl (a sketch of the entity class follows below). In the end I decided to hide the address, sorry.
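The listing references a Picture entity that the post never shows. Here is a minimal sketch of what it might look like, inferred from the setId/setSrc/setAlt and getId/getSrc/getAlt calls above; the field names are assumptions, not the author's original class. Beyond this entity, Jsoup (org.jsoup:jsoup) is the only external dependency the crawler needs on the classpath.

package com.npk.test;

/**
 * Minimal entity sketch inferred from the crawler code above;
 * field names are assumptions, not the author's original class.
 */
public class Picture {
    private int id;     // running number assigned by the crawler
    private String src; // value of the <img> tag's src attribute
    private String alt; // value of the <img> tag's alt attribute

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }

    public String getSrc() { return src; }
    public void setSrc(String src) { this.src = src; }

    public String getAlt() { return alt; }
    public void setAlt(String alt) { this.alt = alt; }
}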
