1. 网络爬虫
- 网络爬虫(又被称为网页蜘蛛,网络机器人),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。
- 另外一些不常使用的名字还有蚂蚁、自动索引、模拟程序或者蠕虫.
2. 流程
- 网络爬虫是做什么的? 它的主要工作就是根据指定的 URL 地址去发送请求, 获得响应, 然后解析响应: 一方面从响应中查找出想要的数据, 另一方面从响应中解析出新的 URL 路径, 然后继续访问、继续解析, 继续查找需要的数据和继续解析出新的 URL 路径。
- 这就是网络爬虫主要干的工作. 下面是流程图:
3.代码实现
pom文件
<dependencies>
<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.10</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
<!-- 文件下载 -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.27</version>
</dependency>
</dependencies>
代码
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
public class Pa {
    /** Site being scraped; relative img/link paths are resolved against this. */
    private static final String BASE_URL = "http://www.ktbdqn.com/";

    /**
     * Downloads the raw HTML source of a web page.
     *
     * @param url      address of the page to fetch
     * @param encoding character encoding used to decode the response body
     * @return the page source with lines separated by '\n'; whatever was read
     *         so far (possibly empty) if an I/O error occurs
     */
    public static String getHtmlResourceByUrl(String url, String encoding) {
        StringBuilder buffer = new StringBuilder();
        try {
            URLConnection uc = new URL(url).openConnection();
            // try-with-resources closes the reader (and underlying stream) even
            // on error; the original leaked the BufferedReader
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(uc.getInputStream(), encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buffer.append(line).append('\n');
                }
            }
        } catch (IOException e) {
            // best-effort, matching the original: report and return partial content
            e.printStackTrace();
        }
        return buffer.toString();
    }

    /**
     * Fetches a page and saves its HTML source to E:/cskt/cskt.html.
     *
     * @param url      page to download
     * @param encoding encoding used both to decode the page and to encode the file
     */
    public static void getJobInfo(String url, String encoding) {
        String html = getHtmlResourceByUrl(url, encoding);
        File fp = new File("E:/cskt/cskt.html");
        // make sure the target directory exists so FileOutputStream doesn't fail
        File dir = fp.getParentFile();
        if (dir != null) {
            dir.mkdirs();
        }
        try (OutputStream os = new FileOutputStream(fp)) {
            // BUG FIX: the original used html.getBytes() (platform default charset),
            // which corrupts the file when it differs from the page encoding.
            os.write(html.getBytes(encoding));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads every image ({@code <img src>}) and stylesheet ({@code <link href>})
     * referenced by the page at {@code uri} into E:/cskt/, preserving the site-relative
     * path as the file name.
     *
     * @param uri page whose resources are downloaded
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection or file-write error
     */
    public static void getImg(String uri) throws ClientProtocolException, IOException {
        // try-with-resources guarantees client/response are closed even when a
        // download throws (the original only closed them on the success path)
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            String content;
            try (CloseableHttpResponse response = httpclient.execute(new HttpGet(uri))) {
                // page body is GBK-encoded on this site
                content = EntityUtils.toString(response.getEntity(), "gbk");
            }
            Document doc = Jsoup.parse(content);

            // images: <img src="...">
            Elements images = doc.select("img");
            for (int i = 0; i < images.size(); i++) {
                downloadResource(httpclient, i + 1, images.get(i).attr("src"));
            }

            // stylesheets: <link href="...">
            Elements links = doc.select("link");
            for (int i = 0; i < links.size(); i++) {
                downloadResource(httpclient, i + 1, links.get(i).attr("href"));
            }
        }
    }

    /**
     * Fetches one site-relative resource and saves it under E:/cskt/.
     * The links on the page lack a scheme/host, so BASE_URL is prepended.
     */
    private static void downloadResource(CloseableHttpClient httpclient, int index,
                                         String relative) throws IOException {
        System.out.println(relative);
        String absolute = BASE_URL + relative;
        System.out.println(index + ":" + absolute);
        try (CloseableHttpResponse response = httpclient.execute(new HttpGet(absolute));
             InputStream inputStream = response.getEntity().getContent()) {
            // commons-io 2.5 FileUtils.copyToFile does NOT close the input stream,
            // so it is managed by try-with-resources above
            FileUtils.copyToFile(inputStream, new File("E://cskt//" + relative));
        }
    }

    public static void main(String[] args) throws ClientProtocolException, IOException {
        getJobInfo(BASE_URL, "utf-8");
        getImg(BASE_URL);
    }
}
实现效果