多线程爬取京东商城数据
pom文件
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the mySpider crawler application.

  The Spring Boot parent and every Spring Boot starter must come from the same
  release line. The parent is therefore set to 2.4.2 (the version the starters
  were previously pinned to), and the starters inherit their version from the
  parent's dependency management instead of declaring their own.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.4.2</version>
        <!-- Empty relativePath: resolve the parent from the repository, not the file system. -->
        <relativePath />
    </parent>

    <groupId>org.example</groupId>
    <artifactId>mySpider</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- Spring Boot starters: versions are managed by the parent POM. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <!-- Database access: Druid connection pool and MySQL JDBC driver. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>1.2.4</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.22</version>
        </dependency>

        <!-- MyBatis-Plus dependencies -->
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-boot-starter</artifactId>
            <version>3.4.1</version>
        </dependency>
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-generator</artifactId>
            <version>3.4.1</version>
        </dependency>
        <!-- Template engine used by mybatis-plus-generator -->
        <dependency>
            <groupId>org.freemarker</groupId>
            <artifactId>freemarker</artifactId>
            <version>2.3.28</version>
        </dependency>

        <!-- HTTP fetching and HTML parsing -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.4</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.14.2</version>
        </dependency>

        <!-- General-purpose utilities -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.73</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>22.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.9</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.8</version>
        </dependency>

        <!-- Unit testing -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
yaml 文件
# Spring Boot application configuration: MySQL data source and Redis connection.
spring:
  datasource:
    url: jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8&serverTimezone=UTC
    driver-class-name: com.mysql.cj.jdbc.Driver
    username: root
    password: root
    # NOTE(review): these are Commons DBCP2 pool settings, but the POM declares
    # druid-spring-boot-starter as the connection pool. They are presumably
    # ignored unless DBCP2 is the active pool; confirm, and move them under
    # spring.datasource.druid if Druid is intended.
    dbcp2:
      min-idle: 5
      initial-size: 5
      max-total: 5
      max-wait-millis: 100
  redis:
    database: 0
    host: 127.0.0.1
    port: 6379
    password: 123456
mysql表
/*
 * Navicat Premium Data Transfer
 *
 * Source Server         : mysql
 * Source Server Type    : MySQL
 * Source Server Version : 50719
 * Source Host           : localhost:3306
 * Source Schema         : spider
 *
 * Target Server Type    : MySQL
 * Target Server Version : 50719
 * File Encoding         : 65001
 *
 * Date: 03/05/2022 16:39:54
 */

SET NAMES utf8mb4;

-- Turn foreign-key checks off while the table is dropped and recreated;
-- they are turned back on at the end of the script.
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for goods_info
-- One row per goods record: goods id, name, price and image URL.
-- ----------------------------
DROP TABLE IF EXISTS `goods_info`;

CREATE TABLE `goods_info` (
    `id`          INT(11)      NOT NULL AUTO_INCREMENT,
    `goods_id`    VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
    `goods_name`  VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
    `goods_price` VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
    `img_url`     VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
    PRIMARY KEY (`id`) USING BTREE
)
    ENGINE = InnoDB
    AUTO_INCREMENT = 1
    CHARACTER SET = utf8
    COLLATE = utf8_bin
    ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;
代码结构
技术点
1.SpringBoot
2.SpringMVC
3.HttpClient
4.Jsoup
5.多线程(线程池)
6.redis
7.mysql(mybatis-plus)
项目流程
1.启动类进行启动,通过 @PostConstruct 注解的方法调用spiderHandle
2.spiderHandle
在spiderHandle中,使用线程池处理任务,线程工厂(ThreadFactory)和拒绝策略由自己定义;同时引入countDownLatch进行线程同步,使主线程等待线程池的所有任务结束,便于计时。
3.SpiderService处理爬取数据,进行解析,同时批量插入到数据库中。注意:因为是多线程爬取,如果爬取的数据需要存入集合,需要采用并发安全的List,这里使用了synchronized锁。
4.GoodsInfoMapper
是使用mybatis-plus生成得到
5.Redis队列
在爬取的时候,可能因为网络等原因导致某一页的数据爬取失败。因此,将爬取失败的页码放入到Redis队列中,以便之后重新爬取。
// Left-push the value stored under "page" in parms onto the Redis list named "page".
redisTemplate.opsForList().leftPush("page",parms.get("page"));
我在后台另外启动一个线程,以自旋的形式将Redis队列中的数据阻塞式取出,然后再重新爬取一次。
// Submit one crawl task per page number 1, 3, 5, ..., 199 (100 tasks in total).
// NOTE(review): assumes countDownLatch was created with a count equal to the
// number of tasks submitted here; confirm at the declaration site.
for (int page = 1; page < 201; page += 2) {
    // Each task gets its own parameter map, so tasks do not share mutable state.
    Map<String, String> params = new HashMap<>();
    params.put("keyword", "零食");
    params.put("enc", "utf-8");
    params.put("wc", "零食");
    params.put("page", String.valueOf(page));
    threadPoolExecutor.execute(() -> {
        try {
            goodsInfoService.spiderData(SysConstant.BASE_URL, params);
        } finally {
            // Count down even when spiderData throws; otherwise a thread
            // waiting on the latch would never be released.
            countDownLatch.countDown();
        }
    });
}
爬虫思路:
源代码地址
源代码在本人github上:多线程爬虫