1. 项目搭建
1.1 创建一个maven工程
1.1.1 引入依赖
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.bjc.crawler</groupId>
    <artifactId>autoHome</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <!-- Java source/target level used for compilation -->
    <properties>
        <java.version>1.8</java.version>
    </properties>
    <!-- Spring Boot parent: manages the versions of every dependency below that has no explicit <version> -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.10.RELEASE</version>
    </parent>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Hot-reload support; optional so it is not passed on transitively to modules that depend on this one -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <optional>true</optional>
        </dependency>
        <!-- JUnit / Spring test support -->
        <!-- NOTE(review): usually declared with <scope>test</scope>; confirm the test classes live under src/test/java before adding it -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
        </dependency>
        <!-- MyBatis starter -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.1.1</version>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>
        <!-- Druid connection pool -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.9</version>
        </dependency>
        <!-- Utility libraries -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.3.2</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <!-- HttpClient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>
        <!-- jsoup HTML parser -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- Spring scheduling support (Quartz integration classes) -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context-support</artifactId>
        </dependency>
        <!-- Quartz scheduler; slf4j-api is excluded here -->
        <dependency>
            <groupId>org.quartz-scheduler</groupId>
            <artifactId>quartz</artifactId>
            <version>2.2.1</version>
            <exclusions>
                <exclusion>
                    <artifactId>slf4j-api</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>
</project>
1.1.2 application.properties配置文件
# Logging levels
logging.level.org.mybatis=DEBUG
logging.level.com.bjc=DEBUG
# Database configuration:
## MySQL JDBC driver class
spring.datasource.driverClassName=com.mysql.jdbc.Driver
## JDBC connection URL
spring.datasource.url=jdbc:mysql://localhost:3306/mybatis
## Database user name and password
spring.datasource.username=root
spring.datasource.password=root
## Connection pool implementation: com.alibaba.druid.pool.DruidDataSource
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
## Package scanned for MyBatis type aliases
mybatis.type-aliases-package=com.bjc.pojo
1.1.3 启动类
package com.bjc;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
/**
 * Spring Boot entry point of the crawler application.
 */
@SpringBootApplication
public class Application {

    /**
     * Starts the Spring application context.
     *
     * @param args command-line arguments handed to Spring Boot unchanged
     */
    public static void main(String[] args) {
        // Same as the static SpringApplication.run(Application.class, args) shortcut.
        SpringApplication application = new SpringApplication(Application.class);
        application.run(args);
    }
}
1.1.4 Mapper层
package com.bjc.mapper;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Select;
/**
 * MyBatis mapper used to verify that the database connection works.
 */
@Mapper
public interface TestMapper {

    /**
     * Runs {@code select now()} against the database.
     *
     * @return the value returned by the query, as a string
     */
    @Select("select now()")
    String queryDate();
}
注意:这里使用@Mapper注解让MyBatis自动注册该接口,并通过@Select注解直接在方法上编写SQL,因此不需要再写mapper映射文件了。
1.1.5 service层
package com.bjc.biz.impl;
import javax.annotation.Resource;
import org.springframework.stereotype.Service;
import com.bjc.biz.ITestBiz;
import com.bjc.mapper.TestMapper;
/**
 * Service that delegates to {@link TestMapper}.
 */
@Service
public class TestBiz implements ITestBiz {

    @Resource
    private TestMapper testMapper;

    /**
     * Returns whatever {@link TestMapper#queryDate()} returns.
     */
    @Override
    public String queryDate() {
        final String now = this.testMapper.queryDate();
        return now;
    }
}
1.2 测试类
package com.bjc.test.demo;
import javax.annotation.Resource;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import com.bjc.biz.ITestBiz;
/**
 * Smoke test: boots the Spring context and prints the result of {@link ITestBiz#queryDate()}.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@SpringBootTest
public class TestDemo {

    @Resource
    private ITestBiz testBiz;

    /** Calls the service once and prints what it returned. */
    @Test
    public void test() {
        final String now = testBiz.queryDate();
        System.out.println("当前时间:" + now);
    }
}
运行结果:
项目基本框架搭建完成。
2. 开发分析
2.1 流程分析
首先,需求分析,理解需求要做什么,然后试着画流程图,知道程序的数据流向,最后开始设计数据库表。
我们的需求是爬取汽车之家https://www.autohome.com.cn/bestauto 的评测数据。
抓取页面的流程如下
抓取评测数据:
1)根据url抓取html页面
2)对html页面进行解析,获取该页面所有的评测数据
3)遍历所有的评测数据
4)判断遍历到的评测数据是否已保存:如果已保存,则继续遍历下一条评测数据;如果未保存,则执行下一步
5)保存评测数据到数据库中
2.2 数据库表
-- Review table for Autohome: one row per reviewed car.
-- Each column's meaning and unit is given in its own COMMENT clause.
CREATE TABLE `car_test` (
-- Surrogate primary key
`id` BIGINT(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
-- Name of the reviewed car (required)
`title` VARCHAR(100) NOT NULL COMMENT '评测车辆的名字',
-- Measured values, stored as integers in the units named in each COMMENT.
-- NOTE(review): the (150) on INT looks like a display width, not a storage size; confirm it is intended.
`test_speed` INT(150) DEFAULT NULL COMMENT '评测项目-加速(0-100公里/小时),单位毫秒',
`test_brake` INT(150) DEFAULT NULL COMMENT '评测项目-刹车(100-0公里/小时),单位毫米',
`test_oil` INT(150) DEFAULT NULL COMMENT '评测项目-实测油耗(升/100公里),单位毫升',
-- Up to three editors, each with a name and a remark
`editor_name1` VARCHAR(10) DEFAULT NULL COMMENT '评测编辑1',
`editor_remark1` VARCHAR(1000) DEFAULT NULL COMMENT '点评内容1',
`editor_name2` VARCHAR(10) DEFAULT NULL COMMENT '评测编辑2',
`editor_remark2` VARCHAR(1000) DEFAULT NULL COMMENT '点评内容2',
`editor_name3` VARCHAR(10) DEFAULT NULL COMMENT '评测编辑3',
`editor_remark3` VARCHAR(1000) DEFAULT NULL COMMENT '点评内容3',
-- Comma-separated image file names
`image` VARCHAR(1000) DEFAULT NULL COMMENT '评测图片,5张图片名,中间用,分隔',
-- Creation / last-update timestamps
`created` DATETIME DEFAULT NULL COMMENT '创建时间',
`updated` DATETIME DEFAULT NULL COMMENT '更新时间',
PRIMARY KEY (`id`)
) ENGINE=INNODB AUTO_INCREMENT=7 DEFAULT CHARSET=utf8 COMMENT='汽车之家评测表';
2.3 创建pojo与mapper和service
3. 整合HttpClient的连接池管理器
3.1 编写连接池管理器
这里使用连接池管理,而连接池管理器应该交给Spring进行管理,我们这里使用以下两个注解
@Configuration注解声明配置类。
@Bean注解声明如何创建这个实例
package com.bjc.config;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
/**
 * Registers the shared HttpClient connection pool as a Spring bean.
 */
@Configuration
public class HttpClientCMCfg {

    /**
     * Builds the pooled connection manager.
     *
     * @return a manager limited to 200 connections in total and 20 per route
     */
    @Bean
    public PoolingHttpClientConnectionManager poolingHttpClientConnectionManager() {
        final PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
        // Per-route limit
        connectionManager.setDefaultMaxPerRoute(20);
        // Limit across all routes
        connectionManager.setMaxTotal(200);
        return connectionManager;
    }
}
3.2 使用定时任务清理无效的连接
3.2.1 定时器配置类
package com.bjc.config;
import org.quartz.CronTrigger;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.quartz.CronTriggerFactoryBean;
import org.springframework.scheduling.quartz.JobDetailFactoryBean;
import org.springframework.scheduling.quartz.SchedulerFactoryBean;
import com.bjc.job.CloseConnectJob;
/**
 * Quartz wiring for the job that closes expired HttpClient connections.
 */
@Configuration
public class SchedledCfg {

    /**
     * Job definition for {@link CloseConnectJob}.
     *
     * @return factory bean describing the job
     */
    @Bean("closeConnectJobBean")
    public JobDetailFactoryBean closeConnectJobBean() {
        final JobDetailFactoryBean jobFactory = new JobDetailFactoryBean();
        jobFactory.setJobClass(CloseConnectJob.class);
        // The Spring context is stored in the job data map under this key; the job reads it back.
        jobFactory.setApplicationContextJobDataKey("context");
        // Keep the job registered even when no trigger is bound to it.
        jobFactory.setDurability(true);
        return jobFactory;
    }

    /**
     * Cron trigger for the close-connection job.
     *
     * @param itemJobBean the job factory, selected by bean name through {@code @Qualifier}
     * @return factory bean producing the trigger
     */
    @Bean("closeConnectJobTrigger")
    public CronTriggerFactoryBean closeConnectJobTrigger(
            @Qualifier(value = "closeConnectJobBean") JobDetailFactoryBean itemJobBean) {
        final CronTriggerFactoryBean triggerFactory = new CronTriggerFactoryBean();
        triggerFactory.setCronExpression("0/5 * * * * ? ");
        // Bind the trigger to the job produced by the factory.
        triggerFactory.setJobDetail(itemJobBean.getObject());
        return triggerFactory;
    }

    /**
     * Scheduler that runs every cron trigger passed in.
     *
     * @param cronTriggerImpl the cron triggers to register
     * @return the scheduler factory bean
     */
    @Bean
    public SchedulerFactoryBean schedulerFactory(CronTrigger[] cronTriggerImpl) {
        final SchedulerFactoryBean schedulerFactoryBean = new SchedulerFactoryBean();
        schedulerFactoryBean.setTriggers(cronTriggerImpl);
        return schedulerFactoryBean;
    }
}
3.2.2 定义任务job
package com.bjc.job;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.quartz.DisallowConcurrentExecution;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.springframework.context.ApplicationContext;
import org.springframework.scheduling.quartz.QuartzJobBean;
/**
 * Quartz job that asks the shared HttpClient connection manager to close expired connections.
 */
// Executions do not overlap: a new run is not started while the previous one is still running.
@DisallowConcurrentExecution
public class CloseConnectJob extends QuartzJobBean {

    /**
     * Looks up the connection manager in the Spring context and closes its expired connections.
     *
     * @param context Quartz execution context; its job data map holds the Spring context under "context"
     */
    @Override
    protected void executeInternal(JobExecutionContext context) throws JobExecutionException {
        // The Spring context was stored in the job data map when the job was defined.
        final Object storedContext = context.getJobDetail().getJobDataMap().get("context");
        final ApplicationContext applicationContext = (ApplicationContext) storedContext;
        final PoolingHttpClientConnectionManager connectionManager =
                applicationContext.getBean(PoolingHttpClientConnectionManager.class);
        connectionManager.closeExpiredConnections();
        System.out.println("关闭失效连接");
    }
}
4. 实现APIService
需要实现2个功能的下载
1)Get请求获取页面数据
2)Get请求下载图片
package com.bjc.biz.impl;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
import javax.annotation.Resource;
import org.apache.http.Header;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Service;
import com.bjc.biz.APIService;
/**
 * {@link APIService} implementation backed by Apache HttpClient and the shared connection pool.
 *
 * <p>Both methods report failure by returning {@code null}; exceptions raised while executing
 * the request are printed and not propagated.
 */
@Service
public class ApiServiceImpl implements APIService {

    /** Directory downloaded images are written to. */
    private static final String IMAGE_DIR = "D:/testcode1/images/";

    /**
     * Value sent as the User-Agent header.
     * NOTE(review): this is empty in the original code although the intent is to look like a
     * browser; supply a real browser User-Agent string if the server rejects requests.
     */
    private static final String USER_AGENT = "";

    // Shared connection pool, injected by Spring
    @Resource
    private PoolingHttpClientConnectionManager poolingHttpClientConnectionManager;

    /**
     * Downloads a page with a GET request.
     *
     * @param url address of the page
     * @return the body decoded as UTF-8; an empty string when the response is 200 without a body;
     *         {@code null} when the status is not 200 or the request failed
     */
    @Override
    public String getHtml(String url) {
        CloseableHttpClient httpClient = newClient();
        HttpGet get = newGet(url);
        CloseableHttpResponse res = null;
        try {
            res = httpClient.execute(get);
            if (res.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            if (res.getEntity() == null) {
                return "";
            }
            return EntityUtils.toString(res.getEntity(), "UTF-8");
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            closeQuietly(res);
        }
    }

    /**
     * Downloads an image with a GET request and stores it under {@link #IMAGE_DIR}.
     *
     * @param url address of the image
     * @return the generated file name ({@code <uuid>.<subtype>}), or {@code null} when the status
     *         is not 200, the response is not an {@code image/*} type, or the download failed
     */
    @Override
    public String getImage(String url) {
        CloseableHttpClient httpClient = newClient();
        HttpGet get = newGet(url);
        CloseableHttpResponse res = null;
        try {
            res = httpClient.execute(get);
            if (res.getStatusLine().getStatusCode() != 200 || res.getEntity() == null) {
                return null;
            }
            Header contentType = res.getEntity().getContentType();
            if (contentType == null || contentType.getValue() == null) {
                return null;
            }
            // The header looks like "image/png" and may carry parameters ("image/jpeg; charset=...");
            // drop the parameters so they do not end up in the file extension.
            String value = contentType.getValue();
            int paramStart = value.indexOf(';');
            String mimeType = (paramStart >= 0 ? value.substring(0, paramStart) : value).trim();
            String[] split = mimeType.split("/");
            if (split.length != 2 || !split[0].equals("image")) {
                return null;
            }
            // A random UUID keeps file names unique.
            String imageName = UUID.randomUUID().toString() + "." + split[1];
            // try-with-resources guarantees the file handle is released even if the copy fails.
            try (OutputStream outstream = new FileOutputStream(new File(IMAGE_DIR + imageName))) {
                res.getEntity().writeTo(outstream);
            }
            return imageName;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            closeQuietly(res);
        }
    }

    /** Builds a client on top of the shared connection pool. */
    private CloseableHttpClient newClient() {
        return HttpClients.custom().setConnectionManager(poolingHttpClientConnectionManager).build();
    }

    /** Builds a GET request carrying the User-Agent header and the timeout configuration. */
    private HttpGet newGet(String url) {
        HttpGet get = new HttpGet(url);
        get.setHeader("User-Agent", USER_AGENT);
        get.setConfig(getConfig());
        return get;
    }

    /** Closes the response if there is one; a failure to close is printed, not propagated. */
    private static void closeQuietly(CloseableHttpResponse res) {
        if (res == null) {
            return;
        }
        try {
            res.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Builds the per-request timeout configuration. */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000) // timeout for establishing the connection
                .setConnectionRequestTimeout(500) // timeout for obtaining a connection from the pool
                .setSocketTimeout(10000) // timeout while waiting for data
                .build();
    }
}
编写测试类
package com.bjc.test.demo;
import javax.annotation.Resource;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import com.bjc.biz.APIService;
/**
 * Manual checks for {@link APIService}: one page download and one image download.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@SpringBootTest
public class TestApiService {

    @Resource
    private APIService apiService;

    /** Downloads the first list page and prints the text of its title element. */
    @Test
    public void test01() {
        final String pageHtml = apiService.getHtml("https://www.autohome.com.cn/bestauto/1");
        final Document page = Jsoup.parse(pageHtml);
        final String pageTitle = page.select("title").first().text();
        System.out.println(pageTitle);
    }

    /** Downloads one image and prints the value returned by the service. */
    @Test
    public void test02() {
        final String imageName = apiService.getImage("https://car2.autoimg.cn/cardfs/product/g1/M06/08/76/1024x0_1_q95_autohomecar__ChcCQ15o-ruAXCEPAAM3LXFrlvc673.jpg");
        System.out.println(imageName);
    }
}
5. 去重过滤器
在使用网络爬虫过程中,去重是一个不可避免的问题,这里需要对抓取的数据内容进行过滤,就是对车辆型号名称进行去重过滤,避免同一条数据反复保存到数据库中。
传统的去重,可以使用Map或者Set集合、哈希表的方式来实现去重,在数据量较小的情况下,使用这种方式没有问题。可是当我们需要大量爬取数据的时候,这种方式就存在很大问题,因为会极大地占用内存和系统资源,导致爬虫系统崩溃。这里将会使用布隆过滤器。
5.1 布隆过滤器
布隆过滤器 (Bloom Filter)是由Burton Howard Bloom于1970年提出,它是一种space efficient的概率型数据结构,用于判断一个元素是否在集合中。在垃圾邮件过滤的黑白名单方法、爬虫(Crawler)的网址判重模块中等等经常被用到。
哈希表也能用于判断元素是否在集合中,但是布隆过滤器只需要哈希表的1/8或1/4的空间就能解决同样的问题。布隆过滤器可以插入元素,但不可以删除已有元素。其中的元素越多,误报率越大,但是漏报是不可能的。
5.2 布隆过滤器的实现
package com.bjc.utils;
import java.util.BitSet;
import org.apache.commons.lang3.StringUtils;
/**
 * Bloom filter used to de-duplicate titles.
 *
 * <p>Each added string sets one bit per hash function in a bit set of 2^24 bits.
 * {@link #contains(String)} may report a string that was never added (false positive) but never
 * misses one that was added. Blank strings (null, empty or whitespace only) are never stored
 * and are never reported as contained.
 */
public class TitleFilter {
    /* Number of bits in the bit set: 2^24 */
    private static final int DEFAULT_SIZE = 1 << 24;
    /* Seeds of the individual hash functions */
    private static final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37 };
    private final BitSet bits = new BitSet(DEFAULT_SIZE);
    /* One hash function per seed */
    private final SimpleHash[] func = new SimpleHash[seeds.length];

    /** Creates an empty filter with one hash function per seed. */
    public TitleFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    /**
     * Marks a string as seen.
     *
     * @param str the string to record; blank values are ignored, mirroring {@link #contains(String)},
     *            so they can neither throw nor set bits that later cause false positives
     */
    public void add(String str) {
        if (isBlank(str)) {
            return;
        }
        for (SimpleHash f : func) {
            bits.set(f.hash(str), true);
        }
    }

    /**
     * Tests whether a string has probably been added.
     *
     * @param str the string to look up
     * @return {@code false} for blank input or when at least one of its bits is unset;
     *         {@code true} when every bit is set
     */
    public boolean contains(String str) {
        if (isBlank(str)) {
            return false;
        }
        for (SimpleHash f : func) {
            // One unset bit proves the string was never added; no need to check the rest.
            if (!bits.get(f.hash(str))) {
                return false;
            }
        }
        return true;
    }

    /** True for null, empty, or whitespace-only strings (per {@link Character#isWhitespace(char)}). */
    private static boolean isBlank(String str) {
        if (str == null) {
            return true;
        }
        for (int i = 0; i < str.length(); i++) {
            if (!Character.isWhitespace(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** Seeded polynomial string hash mapped into {@code [0, cap)}. */
    public static class SimpleHash {
        private final int cap;
        private final int seed;

        /**
         * @param cap  size of the target range; the result is masked with {@code cap - 1}
         * @param seed multiplier applied at every character
         */
        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * Computes {@code result = seed * result + char} over all characters, then masks the
         * value with {@code cap - 1}.
         *
         * @param value the string to hash; must not be null
         * @return the masked hash value
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                result = seed * result + value.charAt(i);
            }
            return (cap - 1) & result;
        }
    }
}
5.3 初始化布隆过滤器
package com.bjc.config;
import java.util.List;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import com.bjc.biz.CarTestService;
import com.bjc.utils.TitleFilter;
/**
 * Creates the {@link TitleFilter} bean and pre-loads it with the titles already stored in the database.
 */
@Configuration
public class TitleFilterCfg {

    /** Number of titles fetched per query while pre-loading. */
    private static final int PAGE_SIZE = 5000;

    @Autowired
    private CarTestService carTestService;

    /**
     * Builds the filter and adds every stored title to it.
     *
     * <p>Titles are read page by page until a page comes back empty or shorter than
     * {@link #PAGE_SIZE}, so tables with more rows than one page are loaded completely.
     *
     * @return the populated filter
     */
    @Bean
    public TitleFilter titleFilter() {
        TitleFilter titleFilter = new TitleFilter();
        for (int page = 1; ; page++) {
            List<String> list = carTestService.queryByPage(page, PAGE_SIZE);
            if (list == null || list.isEmpty()) {
                break;
            }
            for (String str : list) {
                if (str != null) {
                    titleFilter.add(str);
                }
            }
            // A short page means there is nothing left to read.
            if (list.size() < PAGE_SIZE) {
                break;
            }
        }
        return titleFilter;
    }
}
5.4 业务代码编写
5.4.1 Mapper
package com.bjc.mapper;
import java.util.List;
import java.util.Map;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Select;
import com.bjc.pojo.CarTest;
/**
 * MyBatis mapper for the {@code car_test} table.
 */
@Mapper
public interface CarMapper {

    /**
     * Reads one page of titles.
     *
     * <p>Rows are ordered by {@code id} so that consecutive pages neither overlap nor skip rows;
     * without an ORDER BY the row order of a LIMIT query is not guaranteed.
     *
     * @param map must contain {@code start} (row offset) and {@code rows} (page size)
     * @return the titles of the requested page
     */
    @Select("SELECT title FROM car_test ORDER BY id LIMIT #{start},#{rows}")
    List<String> queryByPage(Map<String, Object> map);

    /**
     * Inserts one review row.
     *
     * @param carTest the review to store; its properties are bound by name to the placeholders below
     */
    @Insert(
            "INSERT INTO `car_test` (" +
            " `title`," +
            " `test_speed`," +
            " `test_brake`," +
            " `test_oil`," +
            " `editor_name1`," +
            " `editor_remark1`," +
            " `editor_name2`," +
            " `editor_remark2`," +
            " `editor_name3`," +
            " `editor_remark3`," +
            " `image`," +
            " `created`," +
            " `updated`" +
            ")" +
            "VALUES" +
            " (" +
            " #{title}," +
            " #{test_speed}," +
            " #{test_brake}," +
            " #{test_oil}," +
            " #{editor_name1}," +
            " #{editor_remark1}," +
            " #{editor_name2}," +
            " #{editor_remark2}," +
            " #{editor_name3}," +
            " #{editor_remark3}," +
            " #{image}," +
            " #{created}," +
            " #{updated}" +
            " )")
    void save(CarTest carTest);
}
5.4.2 service
package com.bjc.biz.impl;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.annotation.Resource;
import org.springframework.stereotype.Service;
import com.bjc.biz.CarTestService;
import com.bjc.mapper.CarMapper;
import com.bjc.pojo.CarTest;
/**
 * {@link CarTestService} implementation that delegates to {@link CarMapper}.
 */
@Service
public class CarTestServiceImpl implements CarTestService {

    @Resource
    private CarMapper carMapper;

    /**
     * Reads one page of titles.
     *
     * @param i page number; the row offset is computed as {@code (i - 1) * j}
     * @param j page size
     * @return the titles returned by the mapper
     */
    @Override
    public List<String> queryByPage(int i, int j) {
        final Map<String, Object> params = new HashMap<String, Object>();
        params.put("rows", j);
        params.put("start", (i - 1) * j);
        return carMapper.queryByPage(params);
    }

    /**
     * Stores one review through the mapper.
     *
     * @param carTest the review to store
     */
    @Override
    public void saveCarTest(CarTest carTest) {
        carMapper.save(carTest);
    }
}
6. 实现爬虫
6.1 测试方法
package com.bjc.test.demo;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import com.bjc.Application;
import com.bjc.biz.APIService;
import com.bjc.biz.CarTestService;
import com.bjc.pojo.CarTest;
import com.bjc.utils.TitleFilter;
/**
 * Test-driven crawl of the Autohome review list pages: parses every review on every page,
 * downloads its picture and stores reviews whose title the filter has not seen yet.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@SpringBootTest(classes = Application.class)
public class CrawlerAutohome {

    /** Prefix of the list-page URL; the page number is appended. */
    private static final String LIST_URL = "https://www.autohome.com.cn/bestauto/";
    /** Pages 1 .. (PAGE_END - 1) are crawled. */
    private static final int PAGE_END = 139;
    /** Class name(s) passed to jsoup to locate a review's title element. */
    private static final String TITLE_CLASS = "uibox-title uibox-title-border";

    @Autowired
    private APIService apiService;
    @Autowired
    private CarTestService carTestService;
    @Autowired
    private TitleFilter titleFilter;

    /**
     * Crawls all list pages. A page that cannot be downloaded, or a review that cannot be parsed
     * or stored, is reported and skipped so that one failure does not abort the whole crawl.
     */
    @Test
    public void testCrawlerAutohome() throws Exception {
        for (int i = 1; i < PAGE_END; i++) {
            String html = apiService.getHtml(LIST_URL + i);
            if (StringUtils.isBlank(html)) {
                // getHtml returns null on failure; parsing null would throw and stop the crawl.
                System.out.println("skip page " + i + ": no html returned");
                continue;
            }
            Document doc = Jsoup.parse(html);
            // One element per review on the page
            Elements cars = doc.select("#bestautocontent div.uibox");
            for (Element car : cars) {
                String title = car.getElementsByClass(TITLE_CLASS).text();
                // A blank title can never be recognised by the filter, so it would be stored again
                // on every run; skip it together with titles that were already stored.
                if (StringUtils.isBlank(title) || titleFilter.contains(title)) {
                    continue;
                }
                try {
                    CarTest carTest = copyCarTest(car);
                    carTest.setImage(getImage(car));
                    saveCarTest(carTest);
                } catch (Exception e) {
                    System.out.println("skip review: " + title);
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Stores a review and then records its title in the filter.
     *
     * <p>The title is recorded only after the insert succeeded; otherwise a failed insert would
     * leave the title marked as stored and the review would never be retried.
     *
     * @param carTest the review to store
     */
    private void saveCarTest(CarTest carTest) {
        carTestService.saveCarTest(carTest);
        titleFilter.add(carTest.getTitle());
    }

    /**
     * Downloads the picture of a review.
     *
     * <p>NOTE(review): as in the original code, only the first picture link is processed.
     *
     * @param car the review element
     * @return the stored image file names joined with ","; empty when nothing could be downloaded
     */
    private String getImage(Element car) {
        List<String> images = new ArrayList<String>();
        Elements elements = car.select(".piclist-box ul.piclist02 a");
        for (Element element : elements) {
            String image = downloadImage(element);
            if (image != null) {
                images.add(image);
            }
            break;
        }
        return StringUtils.join(images, ",");
    }

    /**
     * Follows one picture link: loads the picture page, reads the {@code src} of the element
     * selected by {@code #img} and downloads that image.
     *
     * @param link anchor element pointing at the picture page
     * @return the stored image file name, or {@code null} when any step yields nothing
     */
    private String downloadImage(Element link) {
        String html = apiService.getHtml("https:" + link.attr("href"));
        if (StringUtils.isBlank(html)) {
            return null;
        }
        String src = Jsoup.parse(html).select("#img").attr("src");
        if (StringUtils.isBlank(src)) {
            return null;
        }
        return apiService.getImage("https:" + src);
    }

    /**
     * Reads the fields of one review element into a {@link CarTest}.
     *
     * @param car the review element
     * @return the populated review, with created/updated set to the current time
     * @throws IllegalStateException when one of the expected elements is missing
     */
    private CarTest copyCarTest(Element car) {
        CarTest carTest = new CarTest();
        // Title of the reviewed car
        carTest.setTitle(car.getElementsByClass(TITLE_CLASS).text());
        // Measurements: acceleration, braking, fuel consumption
        carTest.setTest_speed(strToNum(textOf(car, ".tabbox1 dd:nth-child(2) div.dd-div2")));
        carTest.setTest_brake(strToNum(textOf(car, ".tabbox1 dd:nth-child(3) div.dd-div2")));
        carTest.setTest_oil(strToNum(textOf(car, ".tabbox1 dd:nth-child(4) div.dd-div2")));
        // Editors 1-3: name and remark
        carTest.setEditor_name1(textOf(car, ".tabbox2 dd:nth-child(2) > div.dd-div1"));
        carTest.setEditor_remark1(textOf(car, ".tabbox2 dd:nth-child(2) > div.dd-div3"));
        carTest.setEditor_name2(textOf(car, ".tabbox2 dd:nth-child(3) > div.dd-div1"));
        carTest.setEditor_remark2(textOf(car, ".tabbox2 dd:nth-child(3) > div.dd-div3"));
        carTest.setEditor_name3(textOf(car, ".tabbox2 dd:nth-child(4) > div.dd-div1"));
        carTest.setEditor_remark3(textOf(car, ".tabbox2 dd:nth-child(4) > div.dd-div3"));
        // Timestamps
        carTest.setCreated(new Date());
        carTest.setUpdated(carTest.getCreated());
        return carTest;
    }

    /**
     * Returns the text of the first element matching a selector.
     *
     * @throws IllegalStateException when nothing matches, naming the selector
     */
    private String textOf(Element car, String cssQuery) {
        Element match = car.select(cssQuery).first();
        if (match == null) {
            throw new IllegalStateException("No element matches '" + cssQuery + "'");
        }
        return match.text();
    }

    /**
     * Drops the last character of the text (presumably the unit suffix; confirm against the
     * page) and converts the remaining decimal number to an integer scaled by 1000.
     *
     * <p>The product is rounded to the nearest integer, so a value whose binary representation
     * falls just below the intended number is not truncated one unit too low.
     *
     * @param str the measurement text
     * @return the scaled value, or 0 when the text is blank or not a number
     */
    private int strToNum(String str) {
        if (StringUtils.isBlank(str)) {
            return 0;
        }
        String number = StringUtils.substring(str, 0, str.length() - 1);
        try {
            return (int) Math.round(Double.parseDouble(number) * 1000d);
        } catch (NumberFormatException e) {
            e.printStackTrace();
            System.out.println(number);
            return 0;
        }
    }
}
6.2 整合任务
把测试方法中的爬取数据代码改造为任务,再使用Quartz定时任务定时处理,就可以实现定时抓取汽车评测数据,能够获取最新的数据了
6.2.1 改造任务
package com.bjc.job;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.DisallowConcurrentExecution;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.springframework.context.ApplicationContext;
import org.springframework.scheduling.quartz.QuartzJobBean;
import com.bjc.biz.APIService;
import com.bjc.biz.CarTestService;
import com.bjc.pojo.CarTest;
import com.bjc.utils.TitleFilter;
/**
 * Quartz job that crawls the Autohome review list pages: parses every review on every page,
 * downloads its picture and stores reviews whose title the filter has not seen yet.
 */
// Executions do not overlap: a new run is not started while the previous one is still running.
@DisallowConcurrentExecution
public class CrawlerAutohomeJob extends QuartzJobBean {

    /** Prefix of the list-page URL; the page number is appended. */
    private static final String LIST_URL = "https://www.autohome.com.cn/bestauto/";
    /** Pages 1 .. (PAGE_END - 1) are crawled. */
    private static final int PAGE_END = 139;
    /** Class name(s) passed to jsoup to locate a review's title element. */
    private static final String TITLE_CLASS = "uibox-title uibox-title-border";

    private APIService apiService;
    private CarTestService carTestService;
    private TitleFilter titleFilter;

    /**
     * Looks up the collaborating beans in the Spring context and crawls all list pages.
     * A page that cannot be downloaded, or a review that cannot be parsed or stored, is reported
     * and skipped so that one failure does not abort the whole run.
     *
     * @param context Quartz execution context; its job data map holds the Spring context under "context"
     */
    @Override
    protected void executeInternal(JobExecutionContext context) throws JobExecutionException {
        // The Spring context was stored in the job data map when the job was defined.
        ApplicationContext applicationContext = (ApplicationContext) context.getJobDetail().getJobDataMap()
                .get("context");
        this.apiService = applicationContext.getBean(APIService.class);
        this.carTestService = applicationContext.getBean(CarTestService.class);
        this.titleFilter = applicationContext.getBean(TitleFilter.class);

        for (int i = 1; i < PAGE_END; i++) {
            String html = this.apiService.getHtml(LIST_URL + i);
            if (StringUtils.isBlank(html)) {
                // getHtml returns null on failure; parsing null would throw and stop the run.
                System.out.println("skip page " + i + ": no html returned");
                continue;
            }
            Document doc = Jsoup.parse(html);
            // One element per review on the page
            Elements cars = doc.select("#bestautocontent div.uibox");
            for (Element car : cars) {
                String title = car.getElementsByClass(TITLE_CLASS).text();
                // A blank title can never be recognised by the filter, so it would be stored again
                // on every run; skip it together with titles that were already stored.
                if (StringUtils.isBlank(title) || this.titleFilter.contains(title)) {
                    continue;
                }
                try {
                    CarTest carTest = this.copyCarTest(car);
                    carTest.setImage(this.getImage(car));
                    this.saveCarTest(carTest);
                } catch (Exception e) {
                    // Report instead of silently dropping the review.
                    System.out.println("skip review: " + title);
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Stores a review and then records its title in the filter.
     *
     * <p>The title is recorded only after the insert succeeded; otherwise a failed insert would
     * leave the title marked as stored and the review would never be retried.
     *
     * @param carTest the review to store
     */
    private void saveCarTest(CarTest carTest) {
        this.carTestService.saveCarTest(carTest);
        this.titleFilter.add(carTest.getTitle());
    }

    /**
     * Downloads the picture of a review.
     *
     * <p>NOTE(review): as in the original code, only the first picture link is processed.
     *
     * @param car the review element
     * @return the stored image file names joined with "," (the format described for the
     *         {@code image} column); empty when nothing could be downloaded
     */
    private String getImage(Element car) {
        List<String> images = new ArrayList<String>();
        Elements elements = car.select(".piclist-box ul.piclist02 a");
        for (Element element : elements) {
            String image = this.downloadImage(element);
            if (image != null) {
                images.add(image);
            }
            break;
        }
        // List.toString() would store "[name.jpg]"; join the names instead.
        return StringUtils.join(images, ",");
    }

    /**
     * Follows one picture link: loads the picture page, reads the {@code src} of the element
     * selected by {@code #img} and downloads that image.
     *
     * @param link anchor element pointing at the picture page
     * @return the stored image file name, or {@code null} when any step yields nothing
     */
    private String downloadImage(Element link) {
        String html = this.apiService.getHtml("https:" + link.attr("href"));
        if (StringUtils.isBlank(html)) {
            return null;
        }
        String src = Jsoup.parse(html).select("#img").attr("src");
        if (StringUtils.isBlank(src)) {
            return null;
        }
        return this.apiService.getImage("https:" + src);
    }

    /**
     * Reads the fields of one review element into a {@link CarTest}.
     *
     * @param car the review element
     * @return the populated review, with created/updated set to the current time
     * @throws IllegalStateException when one of the expected elements is missing
     */
    private CarTest copyCarTest(Element car) {
        CarTest carTest = new CarTest();
        // Title of the reviewed car
        carTest.setTitle(car.getElementsByClass(TITLE_CLASS).text());
        // Measurements: acceleration, braking, fuel consumption
        carTest.setTest_speed(this.strToNum(this.textOf(car, ".tabbox1 dd:nth-child(2) div.dd-div2")));
        carTest.setTest_brake(this.strToNum(this.textOf(car, ".tabbox1 dd:nth-child(3) div.dd-div2")));
        carTest.setTest_oil(this.strToNum(this.textOf(car, ".tabbox1 dd:nth-child(4) div.dd-div2")));
        // Editors 1-3: name and remark
        carTest.setEditor_name1(this.textOf(car, ".tabbox2 dd:nth-child(2) > div.dd-div1"));
        carTest.setEditor_remark1(this.textOf(car, ".tabbox2 dd:nth-child(2) > div.dd-div3"));
        carTest.setEditor_name2(this.textOf(car, ".tabbox2 dd:nth-child(3) > div.dd-div1"));
        carTest.setEditor_remark2(this.textOf(car, ".tabbox2 dd:nth-child(3) > div.dd-div3"));
        carTest.setEditor_name3(this.textOf(car, ".tabbox2 dd:nth-child(4) > div.dd-div1"));
        carTest.setEditor_remark3(this.textOf(car, ".tabbox2 dd:nth-child(4) > div.dd-div3"));
        // Timestamps
        carTest.setCreated(new Date());
        carTest.setUpdated(carTest.getCreated());
        return carTest;
    }

    /**
     * Returns the text of the first element matching a selector.
     *
     * @throws IllegalStateException when nothing matches, naming the selector
     */
    private String textOf(Element car, String cssQuery) {
        Element match = car.select(cssQuery).first();
        if (match == null) {
            throw new IllegalStateException("No element matches '" + cssQuery + "'");
        }
        return match.text();
    }

    /**
     * Drops the last character of the text (presumably the unit suffix; confirm against the
     * page) and converts the remaining decimal number to an integer scaled by 1000.
     *
     * <p>The product is rounded to the nearest integer, so a value whose binary representation
     * falls just below the intended number is not truncated one unit too low.
     *
     * @param str the measurement text
     * @return the scaled value, or 0 when the text is blank or not a number
     */
    private int strToNum(String str) {
        if (StringUtils.isBlank(str)) {
            return 0;
        }
        String number = StringUtils.substring(str, 0, str.length() - 1);
        try {
            return (int) Math.round(Double.parseDouble(number) * 1000d);
        } catch (NumberFormatException e) {
            e.printStackTrace();
            System.out.println(number);
            return 0;
        }
    }
}
6.2.2 定时任务处理
package com.bjc.config;
import org.quartz.CronTrigger;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.quartz.CronTriggerFactoryBean;
import org.springframework.scheduling.quartz.JobDetailFactoryBean;
import org.springframework.scheduling.quartz.SchedulerFactoryBean;
import com.bjc.job.CloseConnectJob;
import com.bjc.job.CrawlerAutohomeJob;
/**
 * Quartz wiring for both scheduled jobs: the Autohome crawler and the expired-connection cleanup.
 *
 * <p>NOTE(review): the bean names "closeConnectJobBean" and "closeConnectJobTrigger" and the
 * {@code schedulerFactory} method are also defined in {@code SchedledCfg}; confirm that only one
 * of the two configuration classes is meant to be active.
 */
@Configuration
public class SchedledAutoHomeCfg {

    /**
     * Job definition for {@link CrawlerAutohomeJob}.
     *
     * @return factory bean describing the crawler job
     */
    @Bean("crawlerAutohomeJobBean")
    public JobDetailFactoryBean crawlerAutohomeJobBean() {
        final JobDetailFactoryBean crawlerJob = new JobDetailFactoryBean();
        crawlerJob.setJobClass(CrawlerAutohomeJob.class);
        // The Spring context is stored in the job data map under this key; the job reads it back.
        crawlerJob.setApplicationContextJobDataKey("context");
        crawlerJob.setDurability(true);
        return crawlerJob;
    }

    /**
     * Cron trigger for the crawler job.
     *
     * @param itemJobBean the crawler job factory, selected by bean name through {@code @Qualifier}
     * @return factory bean producing the trigger
     */
    @Bean("crawlerAutohomeJobTrigger")
    public CronTriggerFactoryBean crawlerAutohomeJobTrigger(
            @Qualifier(value = "crawlerAutohomeJobBean") JobDetailFactoryBean itemJobBean) {
        final CronTriggerFactoryBean crawlerTrigger = new CronTriggerFactoryBean();
        crawlerTrigger.setCronExpression("0/5 * * * * ? ");
        crawlerTrigger.setJobDetail(itemJobBean.getObject());
        return crawlerTrigger;
    }

    /**
     * Job definition for {@link CloseConnectJob}.
     *
     * @return factory bean describing the cleanup job
     */
    @Bean("closeConnectJobBean")
    public JobDetailFactoryBean closeConnectJobBean() {
        final JobDetailFactoryBean cleanupJob = new JobDetailFactoryBean();
        cleanupJob.setJobClass(CloseConnectJob.class);
        cleanupJob.setApplicationContextJobDataKey("context");
        cleanupJob.setDurability(true);
        return cleanupJob;
    }

    /**
     * Cron trigger for the cleanup job.
     *
     * @param itemJobBean the cleanup job factory, selected by bean name through {@code @Qualifier}
     * @return factory bean producing the trigger
     */
    @Bean("closeConnectJobTrigger")
    public CronTriggerFactoryBean closeConnectJobTrigger(
            @Qualifier(value = "closeConnectJobBean") JobDetailFactoryBean itemJobBean) {
        final CronTriggerFactoryBean cleanupTrigger = new CronTriggerFactoryBean();
        cleanupTrigger.setCronExpression("0/5 * * * * ? ");
        cleanupTrigger.setJobDetail(itemJobBean.getObject());
        return cleanupTrigger;
    }

    /**
     * Scheduler that runs every cron trigger passed in.
     *
     * @param cronTriggerImpl the cron triggers to register
     * @return the scheduler factory bean
     */
    @Bean
    public SchedulerFactoryBean schedulerFactory(CronTrigger[] cronTriggerImpl) {
        final SchedulerFactoryBean schedulerFactoryBean = new SchedulerFactoryBean();
        schedulerFactoryBean.setTriggers(cronTriggerImpl);
        return schedulerFactoryBean;
    }
}