工具:
- HttpClient 模拟发送请求,获取网站Html数据
- Jsoup 解析Html数据,获取图片链接
- Firebug 查看页面信息,寻找爬取规律
代码:
package ren.hz.spider.mzitu;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class GetInfoFromMz {
public String getInfo() {
//创建HttpClient
CloseableHttpClient httpClient = HttpClients.createDefault()
//新建Get请求
HttpGet get = new HttpGet("http://www.mzitu.com/all")
//创建响应接受
CloseableHttpResponse response
try {
//执行请求
response = httpClient.execute(get)
if (response.getStatusLine().getStatusCode() == 200) {
HttpEntity entity = response.getEntity()
String html = EntityUtils.toString(entity)
//使用Jsoup解析返回的html
Document document = Jsoup.parse(html)
//获取相应dom标签
Elements div_all = document.select("div.all")
//获取该节点下的所有a标签
Elements as = div_all.select("a")
//输出获取的标签数
System.out.println(as.size())
for (Element a : as) {
//获取a标签文字内容,去除空格,作为保存文件名
String title = a.text().trim()
//获取a标签链接
String link = a.attr("href")
//本地目录
String path = "E:/mzitu/" + title
HttpGet get2 = new HttpGet(link)
response = httpClient.execute(get2)
Document document2 = Jsoup.parse(EntityUtils.toString(response.getEntity()))
//根据页面源码可以知道最大页码值在第21个span处
String max_span = document2.select("span").get(10).text()
//组装url
for (int i = 1
String url = link + "/" + i
if (i == 1) {
url = link
}
System.out.println(url)
HttpGet get3 = new HttpGet(url)
response = httpClient.execute(get3)
Document document3 = Jsoup.parse(EntityUtils.toString(response.getEntity()))
//获取图片地址
String img_url = document3.select("div.main-image").select("img").attr("src")
//根据图片地址,使用流的方式获取图片并存盘
InputStream ipt = httpClient.execute(new HttpGet(img_url)).getEntity().getContent()
File file = new File(path + "/" + img_url.substring(img_url.lastIndexOf("/")))
if (!file.exists()) {
if (!file.getParentFile().exists()) {
file.getParentFile().mkdirs()
}
file.createNewFile()
}
FileOutputStream fileOutputStream = new FileOutputStream(file)
byte[] bytes = new byte[1024]
int j = 0
while ((j = ipt.read(bytes)) != -1) {
fileOutputStream.write(bytes, 0, j)
}
fileOutputStream.flush()
fileOutputStream.close()
}
}
}
} catch (ClientProtocolException e) {
e.printStackTrace()
} catch (IOException e) {
e.printStackTrace()
} finally {
try {
httpClient.close()
} catch (IOException e) {
e.printStackTrace()
}
}
return null
}
public static void main(String[] args) {
new GetInfoFromMz().getInfo()
}
}