最近接触了一下 Java 爬虫,文本信息已经能爬取了,于是想研究一下图片该怎么爬。本文以爬取学校官网为例。
pom依赖
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- 文件下载 -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
设置配置信息:
public class test {
// Address of the site to crawl.
// NOTE(review): this constant shares its simple name with java.net.URL, which
// Download uses as a type in this same class — consider renaming; confirm usages first.
private static final String URL = "http://www.ktbdqn.com/";
// Character encoding name. The accompanying article says it should match the
// encoding of the crawled page, otherwise the fetched text comes out garbled.
// NOTE(review): the name looks like a typo of ENCODING; kept as-is because
// other code may reference it.
private static final String ECODING = "GBK";
// Regex for locating <img ...> tags: "<img", any characters, "src", optional
// whitespace around "=", then everything up to a closing ">".
// The ".*" before "src" is greedy, and capture group 1 is lazy and followed by
// another lazy run, so group 1 tends to capture little or nothing —
// presumably callers use the whole match; verify.
private static final String IMGURL_REG = "<img.*src\\s*=\\s*(.*?)[^>]*?>";
// Regex for locating <link ...> tags; same shape as IMGURL_REG but keyed on
// "<link" and the "href" attribute.
private static final String LINKURL_REG = "<link.*href\\s*=\\s*(.*?)[^>]*?>";
// Regex for extracting the image path from an attribute.
// "(?x)" turns on comments mode (whitespace inside the pattern is ignored).
// Matches src/SRC/background/BACKGROUND, "=", a single or double quote, an
// optional leading "/", zero or more "segment/" parts made of word characters
// and hyphens, and a file name ending in jpg/png/gif (all-lowercase or
// all-uppercase extension), then a closing quote.
// Capture group 3 is the path without the optional leading "/".
private static final String IMGSRC_REG = "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";
// Regex for extracting the stylesheet path from a link's href attribute.
// Same layout as IMGSRC_REG but for href/HREF and a ".css"/".CSS" file name;
// capture group 3 is the path without the optional leading "/".
// NOTE(review): the nested alternatives after the first "css|CSS" repeat the
// same "segments + name.css" shape; they look redundant — confirm before simplifying.
private static final String LINKSRC_REG = "(?x)(href|HREF)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(css|CSS|([\\w-]+/)*([\\w-]+\\.(css|CSS|([\\w-]+/)*([\\w-]+\\.(css|CSS)))))))('|\")";
// Local directory where downloaded CSS files are saved (Windows-style path).
private static final String SAVE_CSS_PATH = "d:\\cskt\\";
// Local directory where downloaded images are saved (Windows-style path).
private static final String SAVE_PATH = "d:\\";
注意:有些人抓取到的网页内容会出现乱码,这是因为编码不一致。请把上面的 ECODING 设置成与被爬取网页相同的编码。
图片下载
public static void Download(List<String> listImgSrc) {
int count = 0;
try {
for (int i = 0; i < listImgSrc.size(); i++) {
String url = listImgSrc.get(i);
String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
URL uri = new URL(url);
// 打开连接
URLConnection con = uri.openConnection();
//设置请求超时为5s