Springboot1.5学习13——练习爬虫Demo

1. 项目搭建

1.1 创建一个maven工程

1.1.1 引入依赖

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.bjc.crawler</groupId>
  <artifactId>autoHome</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
  	<!-- Java version used for compilation -->
	<properties>
		<java.version>1.8</java.version>
	</properties>

	<parent>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-parent</artifactId>
		<version>1.5.10.RELEASE</version>
	</parent>
	
	<dependencies>
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-web</artifactId>
		</dependency>

		<!-- Hot-reload support (devtools) -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-devtools</artifactId>
		</dependency>
		
		<!-- Test starter providing the JUnit environment -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-test</artifactId>
		</dependency>
		
		<!-- MyBatis Spring Boot starter -->
		<dependency>
			<groupId>org.mybatis.spring.boot</groupId>
			<artifactId>mybatis-spring-boot-starter</artifactId>
			<version>1.1.1</version>
		</dependency>
		
		<!-- MySQL JDBC driver -->
		<dependency>
		    <groupId>mysql</groupId>
		    <artifactId>mysql-connector-java</artifactId>
		</dependency>
		<!-- Druid database connection pool -->
		<dependency>
		    <groupId>com.alibaba</groupId>
		    <artifactId>druid</artifactId>
		    <version>1.0.9</version>
		</dependency>
		
		<!-- Utility libraries -->
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-lang3</artifactId>
			<version>3.3.2</version>
		</dependency>
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>2.6</version>
		</dependency>
		
		<!-- HttpClient (used to download pages and images) -->
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
		</dependency>
	
		<!-- jsoup (HTML parsing) -->
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.10.3</version>
		</dependency>
		
		<!-- Spring support classes for scheduled tasks -->
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-context-support</artifactId>
		</dependency>
		<!-- Quartz scheduler; its slf4j-api dependency is excluded -->
		<dependency>
			<groupId>org.quartz-scheduler</groupId>
			<artifactId>quartz</artifactId>
			<version>2.2.1</version>
			<exclusions>
				<exclusion>
					<artifactId>slf4j-api</artifactId>
					<groupId>org.slf4j</groupId>
				</exclusion>
			</exclusions>
		</dependency>
		
	</dependencies>
  
</project>

1.1.2 application.properties配置文件

# Logging levels
logging.level.org.mybatis=DEBUG
logging.level.com.bjc=DEBUG

# Database configuration
## MySQL JDBC driver class
spring.datasource.driverClassName=com.mysql.jdbc.Driver
## Connection URL
spring.datasource.url=jdbc:mysql://localhost:3306/mybatis
## User name and password
spring.datasource.username=root
spring.datasource.password=root

## Connection pool implementation: com.alibaba.druid.pool.DruidDataSource
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource

## Package scanned by MyBatis for type aliases
mybatis.type-aliases-package=com.bjc.pojo

1.1.3 启动类

package com.bjc;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

/**
 * Spring Boot entry point of the crawler demo.
 */
@SpringBootApplication
public class Application {

	/**
	 * Boots the Spring application context.
	 *
	 * @param args command-line arguments, passed through to Spring Boot
	 */
	public static void main(String[] args) {
		SpringApplication.run(Application.class, args);
	}

}

1.1.4 Mapper层

package com.bjc.mapper;

import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Select;

/**
 * Minimal MyBatis mapper used to verify that the database connection works.
 */
@Mapper
public interface TestMapper {

	/**
	 * Runs {@code select now()} against the database.
	 *
	 * @return the value returned by the query, as a string
	 */
	@Select("select now()")
	public String queryDate();
	
}

注意:这里用@Mapper注解把接口注册为MyBatis的Mapper,SQL语句直接写在@Select注解里,所以就不用再写mapper映射文件了。

1.1.5 service层

package com.bjc.biz.impl;

import javax.annotation.Resource;

import org.springframework.stereotype.Service;

import com.bjc.biz.ITestBiz;
import com.bjc.mapper.TestMapper;

/**
 * Service implementation of {@link ITestBiz} that delegates to {@link TestMapper}.
 */
@Service
public class TestBiz implements ITestBiz {

	// Mapper injected by Spring
	@Resource
	private TestMapper testMapper;
	
	/**
	 * Returns whatever {@link TestMapper#queryDate()} returns.
	 */
	@Override
	public String queryDate() {
		return testMapper.queryDate();
	}

}

1.2 测试类

package com.bjc.test.demo;

import javax.annotation.Resource;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;

import com.bjc.biz.ITestBiz;


/**
 * Smoke test: starts the Spring context and calls the service layer once.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@SpringBootTest
public class TestDemo {
	// Service under test, injected by Spring
	@Resource
	private ITestBiz testBiz;
	
	/** Prints the value returned by {@code testBiz.queryDate()}. */
	@Test
	public void test() {
		System.out.println("当前时间:" + testBiz.queryDate());
	}
}

运行结果:

项目基本框架搭建完成。

2. 开发分析

2.1 流程分析

首先,需求分析,理解需求要做什么,然后试着画流程图,知道程序的数据流向,最后开始设计数据库表。

我们的需求是爬取汽车之家https://www.autohome.com.cn/bestauto 的评测数据。

抓取页面的流程如下

 

抓取评测数据:

1)根据url抓取html页面

2)对html页面进行解析,获取该页面所有的评测数据

3)遍历所有的评测数据

4)判断当前遍历到的评测数据是否已保存:如果已保存,则继续遍历下一条评测数据;如果未保存,则执行下一步

5)保存评测数据到数据库中

2.2 数据库表

-- Table holding the crawled Autohome car reviews.
-- Per the column comments below, the three measurements are stored as integers
-- (milliseconds, millimetres, millilitres) and `image` holds comma-separated
-- image file names.
CREATE TABLE `car_test` (
  `id` BIGINT(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `title` VARCHAR(100) NOT NULL COMMENT '评测车辆的名字',
  `test_speed` INT(150) DEFAULT NULL COMMENT '评测项目-加速(0-100公里/小时),单位毫秒',
  `test_brake` INT(150) DEFAULT NULL COMMENT '评测项目-刹车(100-0公里/小时),单位毫米',
  `test_oil` INT(150) DEFAULT NULL COMMENT '评测项目-实测油耗(升/100公里),单位毫升',
  `editor_name1` VARCHAR(10) DEFAULT NULL COMMENT '评测编辑1',
  `editor_remark1` VARCHAR(1000) DEFAULT NULL COMMENT '点评内容1',
  `editor_name2` VARCHAR(10) DEFAULT NULL COMMENT '评测编辑2',
  `editor_remark2` VARCHAR(1000) DEFAULT NULL COMMENT '点评内容2',
  `editor_name3` VARCHAR(10) DEFAULT NULL COMMENT '评测编辑3',
  `editor_remark3` VARCHAR(1000) DEFAULT NULL COMMENT '点评内容3',
  `image` VARCHAR(1000) DEFAULT NULL COMMENT '评测图片,5张图片名,中间用,分隔',
  `created` DATETIME DEFAULT NULL COMMENT '创建时间',
  `updated` DATETIME DEFAULT NULL COMMENT '更新时间',
  PRIMARY KEY (`id`)
) ENGINE=INNODB AUTO_INCREMENT=7 DEFAULT CHARSET=utf8 COMMENT='汽车之家评测表';

2.3 创建pojo与mapper和service

3. 整合HttpClient的连接池管理器

3.1 编写连接池管理器

这里使用连接池管理,而连接池管理器应该交给Spring进行管理,我们这里使用以下两个注解
@Configuration注解声明配置类。
@Bean注解声明如何创建这个实例

package com.bjc.config;

import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

/**
 * Spring configuration that exposes the HttpClient connection pool as a bean.
 */
@Configuration
public class HttpClientCMCfg {
	/**
	 * Creates the pooling connection manager.
	 *
	 * @return a manager limited to 200 connections in total and 20 per route
	 */
	@Bean
	public PoolingHttpClientConnectionManager poolingHttpClientConnectionManager() {
		// Create the connection manager
		PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

		// Maximum number of connections in the whole pool
		cm.setMaxTotal(200);

		// Default maximum number of connections per route
		cm.setDefaultMaxPerRoute(20);

		return cm;
	}
}

3.2 使用定时任务清理无效的连接

3.2.1 定时器配置类

package com.bjc.config;

import org.quartz.CronTrigger;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.quartz.CronTriggerFactoryBean;
import org.springframework.scheduling.quartz.JobDetailFactoryBean;
import org.springframework.scheduling.quartz.SchedulerFactoryBean;

import com.bjc.job.CloseConnectJob;

/**
 * Quartz configuration for the job that closes expired HTTP connections:
 * job detail, cron trigger and scheduler.
 */
@Configuration
public class SchedledCfg {

	/**
	 * Job detail for {@link CloseConnectJob}.
	 */
	@Bean("closeConnectJobBean")
	public JobDetailFactoryBean closeConnectJobBean() {
		// Factory that builds the job description
		JobDetailFactoryBean jobDetailFactoryBean = new JobDetailFactoryBean();
		// Key under which the Spring ApplicationContext is placed in the job data map;
		// the job reads the context back with the same key
		jobDetailFactoryBean.setApplicationContextJobDataKey("context");
		// Class executed by this job
		jobDetailFactoryBean.setJobClass(CloseConnectJob.class);
		// Keep the job stored even when no trigger is bound to it
		jobDetailFactoryBean.setDurability(true);

		return jobDetailFactoryBean;
	}

	/**
	 * Cron trigger for the close-connection job.
	 *
	 * @param itemJobBean the job detail factory, selected by bean name via {@code @Qualifier}
	 */
	@Bean("closeConnectJobTrigger")
	public CronTriggerFactoryBean closeConnectJobTrigger(
			@Qualifier(value = "closeConnectJobBean") JobDetailFactoryBean itemJobBean) {
		// Factory that builds the cron trigger
		CronTriggerFactoryBean tigger = new CronTriggerFactoryBean();
		// Bind the job description to the trigger
		tigger.setJobDetail(itemJobBean.getObject());
		// Fire every 5 seconds
		tigger.setCronExpression("0/5 * * * * ? ");
		return tigger;
	}

	/**
	 * Scheduler that runs the cron triggers passed in.
	 *
	 * @param cronTriggerImpl the triggers to register with the scheduler
	 */
	@Bean
	public SchedulerFactoryBean schedulerFactory(CronTrigger[] cronTriggerImpl) {
		// Factory that builds the scheduler
		SchedulerFactoryBean bean = new SchedulerFactoryBean();
		// Register the triggers with the scheduler
		bean.setTriggers(cronTriggerImpl);
		return bean;
	}
}

3.2.2 定义任务job

package com.bjc.job;

import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.quartz.DisallowConcurrentExecution;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.springframework.context.ApplicationContext;
import org.springframework.scheduling.quartz.QuartzJobBean;

//同步执行任务(单线程),也就是当定时任务没有执行完成的情况下,不会启动新的任务
/**
 * Quartz job that closes expired connections of the HttpClient connection pool.
 *
 * {@code @DisallowConcurrentExecution}: a new run is not started while the
 * previous one is still executing.
 */
@DisallowConcurrentExecution
public class CloseConnectJob extends QuartzJobBean {

	/**
	 * Looks up the connection manager in the Spring context and closes its
	 * expired connections.
	 */
	@Override
	protected void executeInternal(JobExecutionContext context) throws JobExecutionException {
		// Fetch the Spring context stored in the job data map under the key "context"
		ApplicationContext applicationContext = (ApplicationContext) context.getJobDetail().getJobDataMap()
				.get("context");

		// Look up the HttpClient connection manager bean
		PoolingHttpClientConnectionManager cm = applicationContext.getBean(PoolingHttpClientConnectionManager.class);

		// Close expired connections
		cm.closeExpiredConnections();
		System.out.println("关闭失效连接");
	}

}

4. 实现APIService

需要实现2个功能的下载

1)Get请求获取页面数据

2)Get请求下载图片

package com.bjc.biz.impl;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;

import javax.annotation.Resource;

import org.apache.http.Header;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Service;

import com.bjc.biz.APIService;

/**
 * {@link APIService} implementation based on Apache HttpClient.
 *
 * <p>Both operations issue a GET request through the shared connection pool and
 * report failure by returning {@code null}; I/O errors are printed, not thrown.
 */
@Service
public class ApiServiceImpl implements APIService {

	/** Directory that downloaded images are written into. */
	private static final String IMAGE_DIR = "D:/testcode1/images/";

	// Shared connection pool, injected by Spring
	@Resource
	private PoolingHttpClientConnectionManager poolingHttpClientConnectionManager;

	/**
	 * Downloads a page and returns its body decoded as UTF-8.
	 *
	 * @param url the address to fetch
	 * @return the page content (empty string when the response has no body), or
	 *         {@code null} when the status is not 200 or the request fails
	 */
	@Override
	public String getHtml(String url) {
		CloseableHttpClient httpClient = newClient();
		HttpGet get = newGet(url);
		CloseableHttpResponse res = null;
		try {
			res = httpClient.execute(get);

			if (res.getStatusLine().getStatusCode() == 200) {
				String html = "";
				if (res.getEntity() != null) { // only decode when there is a body
					html = EntityUtils.toString(res.getEntity(), "UTF-8");
				}
				return html;
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			closeQuietly(res);
		}

		return null;
	}

	/**
	 * Downloads an image and stores it under a random UUID-based file name whose
	 * extension is the subtype of the response's {@code Content-Type}.
	 *
	 * @param url the address of the image
	 * @return the generated file name, or {@code null} when the status is not 200,
	 *         the response is not an {@code image/*} entity, or the download fails
	 */
	@Override
	public String getImage(String url) {
		CloseableHttpClient httpClient = newClient();
		HttpGet get = newGet(url);
		CloseableHttpResponse res = null;
		try {
			res = httpClient.execute(get);

			if (res.getStatusLine().getStatusCode() != 200 || res.getEntity() == null) {
				return null;
			}

			Header contentType = res.getEntity().getContentType();
			if (contentType == null || contentType.getValue() == null) {
				return null;
			}

			// e.g. "image/png" or "image/jpeg;charset=UTF-8": drop the parameters so
			// they do not end up in the file name
			String mimeType = contentType.getValue();
			int paramStart = mimeType.indexOf(';');
			if (paramStart >= 0) {
				mimeType = mimeType.substring(0, paramStart);
			}
			String[] split = mimeType.trim().split("/");
			if (split.length != 2 || !split[0].equals("image")) {
				return null;
			}

			String imageName = UUID.randomUUID().toString() + "." + split[1];

			// try-with-resources: the file handle is released even if the copy fails
			try (OutputStream outstream = new FileOutputStream(new File(IMAGE_DIR + imageName))) {
				res.getEntity().writeTo(outstream);
			}
			return imageName;
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			closeQuietly(res);
		}

		return null;
	}

	// Builds a client on top of the injected connection pool
	private CloseableHttpClient newClient() {
		return HttpClients.custom().setConnectionManager(poolingHttpClientConnectionManager).build();
	}

	// Builds the GET request with the User-Agent header and the timeout settings
	private HttpGet newGet(String url) {
		HttpGet get = new HttpGet(url);
		// User-Agent header, so that the server does not reject programmatic access
		get.setHeader("User-Agent", "");
		get.setConfig(getConfig());
		return get;
	}

	// Closes the response if there is one; a failure to close is only printed
	private void closeQuietly(CloseableHttpResponse res) {
		if (null != res) {
			try {
				res.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	// Request settings shared by all requests
	private RequestConfig getConfig() {
		return RequestConfig.custom()
				.setConnectTimeout(1000) // timeout for establishing the connection
				.setConnectionRequestTimeout(500) // timeout for obtaining a connection from the pool
				.setSocketTimeout(10000) // timeout while waiting for data
				.build();
	}

}

编写测试类

package com.bjc.test.demo;

import javax.annotation.Resource;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;

import com.bjc.biz.APIService;

/**
 * Manual integration tests for {@link APIService}; both tests access the network.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@SpringBootTest
public class TestApiService {
	
	// Service under test, injected by Spring
	@Resource
	private APIService apiService;
	
	/** Downloads a page and prints the text of its {@code <title>} element. */
	@Test
	public void test01() {
		String html = apiService.getHtml("https://www.autohome.com.cn/bestauto/1");
		Document dom = Jsoup.parse(html);
		System.out.println(dom.select("title").first().text());
	}
	
	/** Downloads one image and prints the value returned by {@code getImage}. */
	@Test
	public void test02() {
		// Despite its name, this variable holds the return value of getImage, not HTML
		String html = apiService.getImage("https://car2.autoimg.cn/cardfs/product/g1/M06/08/76/1024x0_1_q95_autohomecar__ChcCQ15o-ruAXCEPAAM3LXFrlvc673.jpg");
		System.out.println(html);
	}
	
}

5. 去重过滤器

        在使用网络爬虫过程中,去重是一个不可避免的问题,这里需要对抓取的数据内容进行过滤,就是对车辆型号名称进行去重过滤,避免同一条数据反复保存到数据库中。

        传统的去重,可以使用Map或者Set集合、哈希表的方式来实现去重,在数据量较小的情况下,使用这种方式没有问题。可是当我们需要大量爬取数据的时候,这种方式就存在很大问题,因为它会极大地占用内存和系统资源,导致爬虫系统崩溃。这里将会使用布隆过滤器。

5.1 布隆过滤器

        布隆过滤器 (Bloom Filter)是由Burton Howard Bloom于1970年提出,它是一种space efficient的概率型数据结构,用于判断一个元素是否在集合中。在垃圾邮件过滤的黑白名单方法、爬虫(Crawler)的网址判重模块中等等经常被用到。
        哈希表也能用于判断元素是否在集合中,但是布隆过滤器只需要哈希表1/8或1/4的空间就能解决同样的问题。布隆过滤器可以插入元素,但不可以删除已有元素。其中的元素越多,误报率越大,但是漏报是不可能的。

5.2 布隆过滤器的实现

package com.bjc.utils;

import java.util.BitSet;

import org.apache.commons.lang3.StringUtils;

//去重过滤器,布隆过滤器
/**
 * De-duplication filter implemented as a Bloom filter over a {@link BitSet}.
 *
 * <p>Every added string sets one bit per hash function. {@link #contains(String)}
 * answers {@code true} only when all of those bits are set, so it may report a
 * string that was never added (false positive) but never misses one that was.
 * {@code null} and blank strings are never stored and never reported.
 */
public class TitleFilter {

	/* Number of bits in the filter: 2^24 */
	private static final int DEFAULT_SIZE = 1 << 24;

	/* Seeds of the individual hash functions; primes are the usual choice */
	private static final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37 };

	private final BitSet bits = new BitSet(DEFAULT_SIZE);

	/* One hash function per seed */
	private final SimpleHash[] func = new SimpleHash[seeds.length];

	public TitleFilter() {
		for (int i = 0; i < seeds.length; i++) {
			func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
		}
	}

	/**
	 * Marks a string as seen.
	 *
	 * <p>{@code null} and blank input is ignored: {@link #contains(String)} never
	 * reports such values, so storing them would only set bits that raise the
	 * false-positive rate (the empty string, for example, hashes to bit 0 under
	 * every seed).
	 *
	 * @param str the string to record
	 */
	public void add(String str) {
		if (isBlank(str)) {
			return;
		}
		for (SimpleHash f : func) {
			bits.set(f.hash(str), true);
		}
	}

	/**
	 * Tells whether a string has (probably) been added before.
	 *
	 * @param str the string to look up
	 * @return {@code false} for {@code null}/blank input or when at least one of
	 *         the string's bits is unset; {@code true} otherwise
	 */
	public boolean contains(String str) {
		if (isBlank(str)) {
			return false;
		}
		for (SimpleHash f : func) {
			// a single unset bit proves the string was never added
			if (!bits.get(f.hash(str))) {
				return false;
			}
		}
		return true;
	}

	/* true for null, empty, or whitespace-only (per Character.isWhitespace) strings */
	private static boolean isBlank(String str) {
		if (str == null) {
			return true;
		}
		for (int i = 0; i < str.length(); i++) {
			if (!Character.isWhitespace(str.charAt(i))) {
				return false;
			}
		}
		return true;
	}

	/* Seeded hash function mapping a string to a bit index */
	public static class SimpleHash {
		private final int cap;
		private final int seed;

		/**
		 * @param cap  number of addressable bits; the result of {@link #hash(String)}
		 *             is masked with {@code cap - 1}, so it only covers the whole
		 *             range when {@code cap} is a power of two
		 * @param seed multiplier of the weighted sum
		 */
		public SimpleHash(int cap, int seed) {
			this.cap = cap;
			this.seed = seed;
		}

		/**
		 * Simple weighted-sum hash: {@code result = seed * result + char} for every
		 * character, then masked with {@code cap - 1}.
		 *
		 * @param value the string to hash; must not be {@code null}
		 * @return the masked hash value
		 */
		public int hash(String value) {
			int result = 0;
			int len = value.length();
			for (int i = 0; i < len; i++) {
				result = seed * result + value.charAt(i);
			}
			return (cap - 1) & result;
		}
	}
}

5.3 初始化布隆过滤器

package com.bjc.config;

import java.util.List;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import com.bjc.biz.CarTestService;
import com.bjc.utils.TitleFilter;

/**
 * Spring configuration that creates the {@link TitleFilter} bean and pre-loads it
 * with the titles already stored in the database.
 */
@Configuration
public class TitleFilterCfg {

	/** Number of titles fetched per query. */
	private static final int PAGE_SIZE = 5000;

	@Autowired
	private CarTestService carTestService;

	/**
	 * Builds the filter and feeds it every stored title, page by page.
	 *
	 * <p>Paging continues until a page comes back empty or shorter than
	 * {@link #PAGE_SIZE}, so tables with more than one page of rows are loaded
	 * completely.
	 *
	 * @return the pre-populated filter
	 */
	@Bean
	public TitleFilter titleFilter() {
		TitleFilter titleFilter = new TitleFilter();

		// NOTE(review): complete coverage assumes queryByPage returns rows in a
		// stable order across calls — confirm against the mapper's SQL
		for (int page = 1;; page++) {
			List<String> titles = carTestService.queryByPage(page, PAGE_SIZE);
			if (titles == null || titles.isEmpty()) {
				break;
			}

			for (String title : titles) {
				if (title != null) {
					titleFilter.add(title);
				}
			}

			// a short page is the last page
			if (titles.size() < PAGE_SIZE) {
				break;
			}
		}

		return titleFilter;
	}

}

5.4 业务代码编写

5.4.1 Mapper

package com.bjc.mapper;

import java.util.List;
import java.util.Map;

import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Select;

import com.bjc.pojo.CarTest;

/**
 * MyBatis mapper for the {@code car_test} table.
 */
@Mapper
public interface CarMapper {

	/**
	 * Returns one page of stored titles.
	 *
	 * <p>Rows are ordered by primary key: without an {@code ORDER BY} the row
	 * order of a {@code LIMIT} query is unspecified, so consecutive pages could
	 * overlap or skip rows.
	 *
	 * @param map query parameters: {@code start} (offset of the first row) and
	 *            {@code rows} (maximum number of rows)
	 * @return the titles of the requested page
	 */
	@Select("SELECT title FROM car_test ORDER BY id LIMIT #{start},#{rows}")
	List<String> queryByPage(Map<String, Object> map);

	/**
	 * Inserts one review record.
	 *
	 * @param carTest the record to store; the {@code #{...}} placeholders are
	 *                resolved against its properties
	 */
	@Insert(
	"INSERT INTO `car_test` (" +
	"	`title`," +
	"	`test_speed`," +
	"	`test_brake`," +
	"	`test_oil`," +
	"	`editor_name1`," +
	"	`editor_remark1`," +
	"	`editor_name2`," +
	"	`editor_remark2`," +
	"	`editor_name3`," +
	"	`editor_remark3`," +
	"	`image`," +
	"	`created`," +
	"	`updated`" +
	")" +
	"VALUES" +
	"	(" +
	"		#{title}," +
	"		#{test_speed}," +
	"		#{test_brake}," +
	"		#{test_oil}," +
	"		#{editor_name1}," +
	"		#{editor_remark1}," +
	"		#{editor_name2}," +
	"		#{editor_remark2}," +
	"		#{editor_name3}," +
	"		#{editor_remark3}," +
	"		#{image}," +
	"		#{created}," +
	"		#{updated}" +
	"	)")
	void save(CarTest carTest);

}

5.4.2 service

package com.bjc.biz.impl;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.annotation.Resource;

import org.springframework.stereotype.Service;

import com.bjc.biz.CarTestService;
import com.bjc.mapper.CarMapper;
import com.bjc.pojo.CarTest;

@Service
public class CarTestServiceImpl implements CarTestService {
	
	@Resource
	private CarMapper carMapper;

	@Override
	public List<String> queryByPage(int i, int j) {
		int start = (i-1) * j;
		Map<String,Object> map = new HashMap<String,Object>();
		map.put("start", start);
		map.put("rows", j);
		return carMapper.queryByPage(map);
	}

	@Override
	public void saveCarTest(CarTest carTest) {
		this.carMapper.save(carTest);
	}

}

 

6. 实现爬虫

6.1 测试方法

package com.bjc.test.demo;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import com.bjc.Application;
import com.bjc.biz.APIService;
import com.bjc.biz.CarTestService;
import com.bjc.pojo.CarTest;
import com.bjc.utils.TitleFilter;

/**
 * Test-driven crawler: walks the Autohome review list pages, extracts every
 * review that has not been stored yet, downloads its picture and saves it.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@SpringBootTest(classes = Application.class)
public class CrawlerAutohome {

	@Autowired
	private APIService apiService;

	@Autowired
	private CarTestService carTestService;

	@Autowired
	private TitleFilter titleFilter;

	/**
	 * Crawls list pages 1 to 138. A page that cannot be downloaded and a review
	 * that cannot be parsed or saved are skipped; the crawl continues with the
	 * next one.
	 */
	@Test
	public void testCrawlerAutohome() throws Exception {
		for (int i = 1; i < 139; i++) {

			String html = apiService.getHtml("https://www.autohome.com.cn/bestauto/" + i);
			if (StringUtils.isBlank(html)) {
				// download failed: skip this page instead of aborting the whole crawl
				continue;
			}

			Document doc = Jsoup.parse(html);

			// every review on the page
			Elements cars = doc.select("#bestautocontent div.uibox");

			for (Element car : cars) {
				String title = car.getElementsByClass("uibox-title uibox-title-border").text();
				// a blank title cannot be de-duplicated; an already known title is not saved again
				if (StringUtils.isBlank(title) || titleFilter.contains(title)) {
					continue;
				}

				try {
					// parse the review into an entity
					CarTest carTest = copyCarTest(car);

					// download the picture and attach its file name
					carTest.setImage(getImage(car));

					saveCarTest(carTest);
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Stores a review and then records its title in the filter.
	 *
	 * <p>The title is recorded only after the save succeeded, so a failed save is
	 * retried on the next crawl.
	 *
	 * @param carTest the review to store
	 */
	private void saveCarTest(CarTest carTest) {
		carTestService.saveCarTest(carTest);

		titleFilter.add(carTest.getTitle());
	}

	/**
	 * Downloads the review picture.
	 *
	 * @param car the review element
	 * @return the stored image file names joined with commas; empty when nothing
	 *         could be downloaded
	 */
	private String getImage(Element car) {
		List<String> images = new ArrayList<String>();

		Elements elements = car.select(".piclist-box ul.piclist02 a");
		for (Element element : elements) {
			// the link leads to a detail page that contains the full-size picture
			String html = apiService.getHtml("https:" + element.attr("href"));
			if (StringUtils.isNotBlank(html)) {
				String src = Jsoup.parse(html).select("#img").attr("src");
				if (StringUtils.isNotBlank(src)) {
					String image = apiService.getImage("https:" + src);
					if (StringUtils.isNotBlank(image)) {
						images.add(image);
					}
				}
			}

			// NOTE(review): only the first link is processed, so at most one image
			// is stored per review — confirm whether all pictures are wanted
			break;
		}

		return StringUtils.join(images, ",");
	}

	/**
	 * Parses a review element into a {@link CarTest}.
	 *
	 * @param car the review element
	 * @return the populated entity, without image
	 * @throws IllegalStateException when an expected element is missing
	 */
	private CarTest copyCarTest(Element car) {
		CarTest carTest = new CarTest();

		// title of the reviewed car
		carTest.setTitle(car.getElementsByClass("uibox-title uibox-title-border").text());

		// the three measurements, each scaled by 1000 (see strToNum)
		carTest.setTest_speed(strToNum(requireText(car, ".tabbox1 dd:nth-child(2) div.dd-div2")));
		carTest.setTest_brake(strToNum(requireText(car, ".tabbox1 dd:nth-child(3) div.dd-div2")));
		carTest.setTest_oil(strToNum(requireText(car, ".tabbox1 dd:nth-child(4) div.dd-div2")));

		// editors 1-3 with their remarks
		carTest.setEditor_name1(requireText(car, ".tabbox2 dd:nth-child(2) > div.dd-div1"));
		carTest.setEditor_remark1(requireText(car, ".tabbox2 dd:nth-child(2) > div.dd-div3"));

		carTest.setEditor_name2(requireText(car, ".tabbox2 dd:nth-child(3) > div.dd-div1"));
		carTest.setEditor_remark2(requireText(car, ".tabbox2 dd:nth-child(3) > div.dd-div3"));

		carTest.setEditor_name3(requireText(car, ".tabbox2 dd:nth-child(4) > div.dd-div1"));
		carTest.setEditor_remark3(requireText(car, ".tabbox2 dd:nth-child(4) > div.dd-div3"));

		// timestamps
		carTest.setCreated(new Date());
		carTest.setUpdated(carTest.getCreated());

		return carTest;
	}

	/**
	 * Returns the text of the first element matching the selector.
	 *
	 * @throws IllegalStateException when nothing matches, so the caller skips the
	 *                               review with a meaningful message instead of a
	 *                               bare NullPointerException
	 */
	private String requireText(Element car, String cssQuery) {
		Element match = car.select(cssQuery).first();
		if (match == null) {
			throw new IllegalStateException("No element matches selector: " + cssQuery);
		}
		return match.text();
	}

	/**
	 * Drops the last character of the string (the unit) and converts the rest to
	 * a number multiplied by 1000.
	 *
	 * <p>The product is rounded, not truncated: with float arithmetic and
	 * truncation "8.11" came out as 8109 instead of 8110.
	 *
	 * @param str the raw text, e.g. a decimal number followed by a one-character unit
	 * @return the scaled value, or 0 when the text is empty or not numeric
	 */
	private int strToNum(String str) {
		if (StringUtils.isEmpty(str)) {
			return 0;
		}
		// strip the trailing unit character
		String number = StringUtils.substring(str, 0, str.length() - 1);
		try {
			return (int) Math.round(Double.parseDouble(number.trim()) * 1000);
		} catch (NumberFormatException e) {
			e.printStackTrace();
			System.out.println(number);
		}
		return 0;
	}

}

6.2 整合任务

把测试方法中的爬取数据代码改造为任务,再使用Quartz定时任务定时处理,就可以实现定时抓取汽车评测数据,能够获取最新的数据了

6.2.1 改造任务

package com.bjc.job;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.DisallowConcurrentExecution;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.springframework.context.ApplicationContext;
import org.springframework.scheduling.quartz.QuartzJobBean;

import com.bjc.biz.APIService;
import com.bjc.biz.CarTestService;
import com.bjc.pojo.CarTest;
import com.bjc.utils.TitleFilter;

//同步执行任务(单线程)
@DisallowConcurrentExecution
/**
 * Quartz job that crawls the Autohome review list pages and stores every review
 * that has not been saved yet.
 *
 * <p>{@code @DisallowConcurrentExecution} on this class means a new run is not
 * started while the previous one is still executing.
 */
public class CrawlerAutohomeJob extends QuartzJobBean {

	private APIService apiService;
	private CarTestService carTestService;
	private TitleFilter titleFilter;

	/**
	 * Looks up the collaborating beans in the Spring context and crawls list
	 * pages 1 to 138. A page that cannot be downloaded and a review that cannot
	 * be parsed or saved are skipped; the crawl continues with the next one.
	 */
	@Override
	protected void executeInternal(JobExecutionContext context) throws JobExecutionException {
		// Spring context stored in the job data map under the key "context"
		ApplicationContext applicationContext = (ApplicationContext) context.getJobDetail().getJobDataMap()
				.get("context");

		this.apiService = applicationContext.getBean(APIService.class);
		this.carTestService = applicationContext.getBean(CarTestService.class);
		this.titleFilter = applicationContext.getBean(TitleFilter.class);

		for (int i = 1; i < 139; i++) {
			String html = this.apiService.getHtml("https://www.autohome.com.cn/bestauto/" + i);
			if (StringUtils.isBlank(html)) {
				// download failed: skip this page instead of aborting the whole run
				continue;
			}

			Document doc = Jsoup.parse(html);

			// every review on the page
			Elements cars = doc.select("#bestautocontent div.uibox");

			for (Element car : cars) {
				String title = car.getElementsByClass("uibox-title uibox-title-border").text();
				// a blank title cannot be de-duplicated; an already known title is not saved again
				if (StringUtils.isBlank(title) || this.titleFilter.contains(title)) {
					continue;
				}

				try {
					// parse the review into an entity
					CarTest carTest = this.copyCarTest(car);

					// download the picture and attach its file name
					carTest.setImage(this.getImage(car));

					this.saveCarTest(carTest);
				} catch (Exception e) {
					// skip this review, but leave a trace of what went wrong
					e.printStackTrace();
				}
			}
		}

	}

	/**
	 * Stores a review and then records its title in the filter.
	 *
	 * <p>The title is recorded only after the save succeeded, so a failed save is
	 * retried on the next run.
	 *
	 * @param carTest the review to store
	 */
	private void saveCarTest(CarTest carTest) {
		this.carTestService.saveCarTest(carTest);

		this.titleFilter.add(carTest.getTitle());
	}

	/**
	 * Downloads the review picture.
	 *
	 * @param car the review element
	 * @return the stored image file names joined with commas; empty when nothing
	 *         could be downloaded
	 */
	private String getImage(Element car) {
		List<String> images = new ArrayList<String>();

		Elements elements = car.select(".piclist-box ul.piclist02 a");
		for (Element element : elements) {
			// the link leads to a detail page that contains the full-size picture
			String html = this.apiService.getHtml("https:" + element.attr("href"));
			if (StringUtils.isNotBlank(html)) {
				String src = Jsoup.parse(html).select("#img").attr("src");
				if (StringUtils.isNotBlank(src)) {
					String image = this.apiService.getImage("https:" + src);
					if (StringUtils.isNotBlank(image)) {
						images.add(image);
					}
				}
			}

			// NOTE(review): only the first link is processed, so at most one image
			// is stored per review — confirm whether all pictures are wanted
			break;
		}

		// comma-separated names; List.toString() would store the brackets as well
		return StringUtils.join(images, ",");
	}

	/**
	 * Parses a review element into a {@link CarTest}.
	 *
	 * @param car the review element
	 * @return the populated entity, without image
	 * @throws IllegalStateException when an expected element is missing
	 */
	private CarTest copyCarTest(Element car) {
		CarTest carTest = new CarTest();

		// title of the reviewed car
		carTest.setTitle(car.getElementsByClass("uibox-title uibox-title-border").text());

		// the three measurements, each scaled by 1000 (see strToNum)
		carTest.setTest_speed(this.strToNum(this.requireText(car, ".tabbox1 dd:nth-child(2) div.dd-div2")));
		carTest.setTest_brake(this.strToNum(this.requireText(car, ".tabbox1 dd:nth-child(3) div.dd-div2")));
		carTest.setTest_oil(this.strToNum(this.requireText(car, ".tabbox1 dd:nth-child(4) div.dd-div2")));

		// editors 1-3 with their remarks
		carTest.setEditor_name1(this.requireText(car, ".tabbox2 dd:nth-child(2) > div.dd-div1"));
		carTest.setEditor_remark1(this.requireText(car, ".tabbox2 dd:nth-child(2) > div.dd-div3"));

		carTest.setEditor_name2(this.requireText(car, ".tabbox2 dd:nth-child(3) > div.dd-div1"));
		carTest.setEditor_remark2(this.requireText(car, ".tabbox2 dd:nth-child(3) > div.dd-div3"));

		carTest.setEditor_name3(this.requireText(car, ".tabbox2 dd:nth-child(4) > div.dd-div1"));
		carTest.setEditor_remark3(this.requireText(car, ".tabbox2 dd:nth-child(4) > div.dd-div3"));

		// timestamps
		carTest.setCreated(new Date());
		carTest.setUpdated(carTest.getCreated());

		return carTest;
	}

	/**
	 * Returns the text of the first element matching the selector.
	 *
	 * @throws IllegalStateException when nothing matches, so the caller skips the
	 *                               review with a meaningful message instead of a
	 *                               bare NullPointerException
	 */
	private String requireText(Element car, String cssQuery) {
		Element match = car.select(cssQuery).first();
		if (match == null) {
			throw new IllegalStateException("No element matches selector: " + cssQuery);
		}
		return match.text();
	}

	/**
	 * Drops the last character of the string (the unit) and converts the rest to
	 * a number multiplied by 1000.
	 *
	 * <p>The product is rounded, not truncated: with float arithmetic and
	 * truncation "8.11" came out as 8109 instead of 8110.
	 *
	 * @param str the raw text, e.g. a decimal number followed by a one-character unit
	 * @return the scaled value, or 0 when the text is empty or not numeric
	 */
	private int strToNum(String str) {
		if (StringUtils.isEmpty(str)) {
			return 0;
		}
		// strip the trailing unit character
		String number = StringUtils.substring(str, 0, str.length() - 1);
		try {
			return (int) Math.round(Double.parseDouble(number.trim()) * 1000);
		} catch (NumberFormatException e) {
			e.printStackTrace();
			System.out.println(number);
		}
		return 0;
	}

}

6.2.2 定时任务处理

package com.bjc.config;

import org.quartz.CronTrigger;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.quartz.CronTriggerFactoryBean;
import org.springframework.scheduling.quartz.JobDetailFactoryBean;
import org.springframework.scheduling.quartz.SchedulerFactoryBean;

import com.bjc.job.CloseConnectJob;
import com.bjc.job.CrawlerAutohomeJob;

/**
 * Quartz configuration for both jobs of the demo: the Autohome crawler job and
 * the job that closes expired HTTP connections, plus the scheduler running them.
 */
@Configuration
public class SchedledAutoHomeCfg {
	// Job detail for the crawler job (CrawlerAutohomeJob)
		@Bean("crawlerAutohomeJobBean")
		public JobDetailFactoryBean crawlerAutohomeJobBean() {
			JobDetailFactoryBean jobDetailFactoryBean = new JobDetailFactoryBean();
			// Key under which the Spring ApplicationContext is placed in the job data map
			jobDetailFactoryBean.setApplicationContextJobDataKey("context");
			jobDetailFactoryBean.setJobClass(CrawlerAutohomeJob.class);
			// Keep the job stored even when no trigger is bound to it
			jobDetailFactoryBean.setDurability(true);

			return jobDetailFactoryBean;
		}

		// Cron trigger for the crawler job; fires every 5 seconds
		@Bean("crawlerAutohomeJobTrigger")
		public CronTriggerFactoryBean crawlerAutohomeJobTrigger(
				@Qualifier(value = "crawlerAutohomeJobBean") JobDetailFactoryBean itemJobBean) {
			CronTriggerFactoryBean tigger = new CronTriggerFactoryBean();
			tigger.setJobDetail(itemJobBean.getObject());
			tigger.setCronExpression("0/5 * * * * ? ");
			return tigger;
		}
		// Job detail for the job that closes expired connections (CloseConnectJob)
		@Bean("closeConnectJobBean")
		public JobDetailFactoryBean closeConnectJobBean() {
			JobDetailFactoryBean jobDetailFactoryBean = new JobDetailFactoryBean();
			// Key under which the Spring ApplicationContext is placed in the job data map
			jobDetailFactoryBean.setApplicationContextJobDataKey("context");
			jobDetailFactoryBean.setJobClass(CloseConnectJob.class);
			// Keep the job stored even when no trigger is bound to it
			jobDetailFactoryBean.setDurability(true);
			
			return jobDetailFactoryBean;
		}
		
		// Cron trigger for the close-connection job; fires every 5 seconds
		@Bean("closeConnectJobTrigger")
		public CronTriggerFactoryBean closeConnectJobTrigger(
				@Qualifier(value = "closeConnectJobBean") JobDetailFactoryBean itemJobBean) {
			CronTriggerFactoryBean tigger = new CronTriggerFactoryBean();
			tigger.setJobDetail(itemJobBean.getObject());
			tigger.setCronExpression("0/5 * * * * ? ");
			return tigger;
		}

		// Scheduler that runs the cron triggers passed in
		@Bean
		public SchedulerFactoryBean schedulerFactory(CronTrigger[] cronTriggerImpl) {
			SchedulerFactoryBean bean = new SchedulerFactoryBean();
			bean.setTriggers(cronTriggerImpl);
			return bean;
		}

}

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值