First, a quick look at how a crawler works. The basic principle is simple: a program visits pages on the internet and saves the retrieved data to local storage. Most internet services are delivered as websites, and the data we want generally lives on those sites: product listings and reviews on e-commerce platforms, Weibo posts, and so on. A crawler does essentially the same thing as manually copying and pasting the data we see on a page, but collecting a large volume of data by hand is clearly impractical, so we need a tool to do it for us. Writing a crawler means encoding a set of network-access rules in a program so that the target data is fetched and saved automatically. With that in mind, let's build a crawler example from scratch.
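To make the principle concrete before the full walkthrough, here is a minimal sketch using only the JDK, no third-party libraries. The URL and output file name are placeholders, not part of the example that follows.

import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

public class MiniFetch {
    public static void main(String[] args) throws Exception {
        // Open a connection to the page (placeholder URL) and read its bytes
        URL url = new URL("http://example.com/");
        try (InputStream in = url.openStream()) {
            // Save the raw response to a local file -- the essence of crawling
            Files.copy(in, Paths.get("demo.html"), StandardCopyOption.REPLACE_EXISTING);
        }
    }
}

Everything that follows is this same fetch-and-save loop, with real libraries handling the HTTP details and HTML parsing.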
Create a Maven project
Add the pom dependencies
<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
    <!-- jsoup HTML parser library @ https://jsoup.org/ -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.12.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
    <dependency>
        <groupId>com.squareup.okhttp3</groupId>
        <artifactId>okhttp</artifactId>
        <version>3.10.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.squareup.okio/okio -->
    <dependency>
        <groupId>com.squareup.okio</groupId>
        <artifactId>okio</artifactId>
        <version>2.0.0</version>
    </dependency>
    <!-- commons-io, used below to save downloaded files -->
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.5</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.5</version>
    </dependency>
</dependencies>
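The pom also declares OkHttp and Okio, although the walkthrough below issues its requests with Apache HttpClient. For reference, a minimal sketch of the same GET request using OkHttp 3 might look like this (OkHttpDemo is an illustrative class name, not part of the example):

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

public class OkHttpDemo {
    public static void main(String[] args) throws Exception {
        OkHttpClient client = new OkHttpClient();
        Request request = new Request.Builder()
                .url("http://www.ktbdqn.com/")
                .build();
        // newCall(...).execute() performs the GET synchronously
        try (Response response = client.newCall(request).execute()) {
            System.out.println(response.body().string());
        }
    }
}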
Create the Test2.java class
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Test2 {

    // Base URL of the target site, used to resolve relative resource links
    private static final String BASE_URL = "http://www.ktbdqn.com/";

    public static void main(String[] args) throws IOException {
        // Create an HttpClient instance
        CloseableHttpClient httpclient = HttpClients.createDefault();
        // Create an HttpGet instance and execute the GET request
        HttpGet httpget = new HttpGet(BASE_URL);
        CloseableHttpResponse response = httpclient.execute(httpget);
        // Read the response entity as a string
        HttpEntity entity = response.getEntity();
        String content = EntityUtils.toString(entity, "utf-8");

        // Save the HTML source to a local file
        try (OutputStream os = new FileOutputStream(new File("F:/demo.html"))) {
            os.write(content.getBytes("utf-8"));
        }

        // Parse the page into a jsoup Document
        Document doc = Jsoup.parse(content);
        // Select the elements that reference external resources
        Elements images = doc.select("img[src]");
        Elements styles = doc.select("link[href]");
        Elements scripts = doc.select("script[src]");

        download(httpclient, scripts, "src", "F:/templates/js/", "js");
        download(httpclient, styles, "href", "F:/templates/css/", "css");
        download(httpclient, images, "src", "F:/templates/images/", "image");

        response.close();   // close the response
        httpclient.close(); // close the client
    }

    // Download every resource referenced by the given elements into dir.
    // Note that file names must be unique, or later downloads overwrite earlier ones.
    private static void download(CloseableHttpClient httpclient, Elements elements,
                                 String attr, String dir, String label) throws IOException {
        for (Element element : elements) {
            String srcStr = element.attr(attr);
            // Only handle relative links; URLs with a protocol prefix point off-site
            if (!srcStr.startsWith("http://") && !srcStr.startsWith("https://")) {
                srcStr = BASE_URL + srcStr;
                HttpGet resourceGet = new HttpGet(srcStr);
                CloseableHttpResponse resourceResponse = httpclient.execute(resourceGet);
                InputStream inputStream = resourceResponse.getEntity().getContent();
                // File name is everything after the last '/'
                String fileName = srcStr.substring(srcStr.lastIndexOf('/') + 1);
                System.out.println("Downloading " + label + ": " + srcStr);
                // commons-io writes the stream to disk, creating parent dirs as needed
                FileUtils.copyToFile(inputStream, new File(dir + fileName));
                System.out.println("Done");
                resourceResponse.close(); // close the resource response
            }
        }
    }
}
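A note on the design choice above: prepending the base URL by hand only works for the simplest link layouts. When jsoup knows the document's base URI, it can resolve relative links itself via the abs: attribute prefix. A small sketch of that alternative, using the same site (AbsUrlDemo is an illustrative class name):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class AbsUrlDemo {
    public static void main(String[] args) throws Exception {
        // connect(...).get() fetches and parses the page, remembering its base URI
        Document doc = Jsoup.connect("http://www.ktbdqn.com/").get();
        for (Element img : doc.select("img[src]")) {
            // "abs:src" resolves the src attribute against the base URI,
            // so relative and absolute links both come back as full URLs
            System.out.println(img.attr("abs:src"));
        }
    }
}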
Run result (screenshot)
Downloaded files on disk (screenshot)
The saved page rendered in a browser (screenshot)