团队目前数据库持久层用的都是Spring Data Jpa。jpa自带的批量保存数据saveAll()方法,本质上是循环调用save()方法。随着一些高频读写数据库的业务出现,写数据速度逐渐不满足要求。有三种方式解决这个问题:
通过扩展jpa实现批量保存数据;
使用jdbc操作批量数据;
换MyBatis。
1. 直接上代码
实体类:
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import java.util.Date;
/**
 * Entity used to benchmark batch-insert strategies against the
 * {@code demo_batch} table.
 *
 * <p>JPA uses property (getter) access here because {@code @Id} is placed on
 * {@link #getId()}. Note there is no {@code @GeneratedValue}: ids are assigned
 * by the caller (the benchmark tests set ids 1..N explicitly).
 */
@Entity
@Table(name = "demo_batch")
public class DemoBatch {

    // Primary key; caller-assigned, not database-generated.
    private Long id;
    private String name;
    private String content;
    // NOTE(review): java.util.Date is legacy — new code would use java.time,
    // but changing the field type would alter the JPA column mapping; left as-is.
    private Date createTime;
    private Date updateTime;

    /** No-arg constructor required by JPA. */
    public DemoBatch() {
    }

    /** Convenience constructor; id and timestamps are set separately. */
    public DemoBatch(String name, String content) {
        this.name = name;
        this.content = content;
    }

    @Id
    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public Date getCreateTime() {
        return createTime;
    }

    public void setCreateTime(Date createTime) {
        this.createTime = createTime;
    }

    public Date getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Date updateTime) {
        this.updateTime = updateTime;
    }
}
1.1 扩展jpa
扩展JpaRepository接口:
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
import org.springframework.data.repository.NoRepositoryBean;
import java.io.Serializable;
/**
 * Extension of Spring Data JPA's {@link JpaRepository} adding true batch
 * insert/update operations, backed by {@code EntityManager.persist/merge}
 * with periodic flush/clear — avoiding the per-entity existence SELECT that
 * {@code save()}/{@code saveAll()} performs.
 *
 * <p>Generified (the original used raw types); existing raw implementors and
 * sub-interfaces still compile unchanged.
 *
 * @param <T>  the entity type
 * @param <ID> the entity identifier type
 */
@NoRepositoryBean
public interface BaseRepository<T, ID extends Serializable>
        extends JpaRepository<T, ID>, JpaSpecificationExecutor<T> {

    /**
     * Persists all given entities in batches. Entities must be new: no
     * insert-vs-update existence check is performed.
     *
     * @param entities entities to insert; must not be {@code null}
     * @return the iterable that was passed in
     */
    Iterable<T> batchInsert(Iterable<T> entities);

    /**
     * Merges all given entities in batches.
     *
     * @param entities entities to update; must not be {@code null}
     * @return the iterable that was passed in (note: not the managed copies
     *         produced by {@code merge})
     */
    Iterable<T> batchUpdate(Iterable<T> entities);
}
实现扩展的接口方法:
import org.springframework.data.jpa.repository.Modifying;
import org.springframework.data.jpa.repository.support.JpaEntityInformation;
import org.springframework.data.jpa.repository.support.SimpleJpaRepository;
import org.springframework.transaction.annotation.Transactional;
import javax.persistence.EntityManager;
import java.io.Serializable;
import java.util.Iterator;
/**
 * Default implementation of {@code BaseRepository}: batches writes through the
 * {@link EntityManager}, flushing and clearing the persistence context every
 * {@link #BATCH_SIZE} entities so the first-level cache does not grow without
 * bound during large imports.
 *
 * <p>Register via
 * {@code @EnableJpaRepositories(repositoryBaseClass = BaseRepositoryImpl.class)}.
 */
@SuppressWarnings("SpringJavaConstructorAutowiringInspection")
public class BaseRepositoryImpl extends SimpleJpaRepository implements BaseRepository {

    /** Entities written between flush/clear cycles; tune to workload. */
    private static final int BATCH_SIZE = 20000;

    private final EntityManager entityManager;

    public BaseRepositoryImpl(JpaEntityInformation entityInformation, EntityManager entityManager) {
        super(entityInformation, entityManager);
        this.entityManager = entityManager;
    }

    public BaseRepositoryImpl(Class domainClass, EntityManager entityManager) {
        super(domainClass, entityManager);
        this.entityManager = entityManager;
    }

    /**
     * Batch-inserts all entities. {@code persist()} issues plain INSERTs,
     * skipping the existence SELECT that {@code save()} performs per entity.
     */
    @Override
    @Transactional
    public Iterable batchInsert(Iterable entities) {
        return applyInBatches(entities, entityManager::persist);
    }

    /**
     * Batch-updates all entities. The managed copies returned by
     * {@code merge()} are discarded — callers get back the original
     * (detached) instances, as in the original implementation.
     */
    @Override
    @Transactional
    public Iterable batchUpdate(Iterable entities) {
        return applyInBatches(entities, entityManager::merge);
    }

    /** Applies {@code operation} to each entity, flushing every BATCH_SIZE. */
    private Iterable applyInBatches(Iterable entities,
                                    java.util.function.Consumer<Object> operation) {
        int count = 0;
        for (Object entity : entities) {
            operation.accept(entity);
            count++;
            if (count % BATCH_SIZE == 0) {
                flushAndClear();
            }
        }
        // Flush the trailing partial batch. When count is an exact multiple
        // of BATCH_SIZE (including 0), everything was already flushed above.
        if (count % BATCH_SIZE != 0) {
            flushAndClear();
        }
        return entities;
    }

    /** Pushes pending SQL to the database and detaches all managed entities. */
    private void flushAndClear() {
        entityManager.flush();
        entityManager.clear();
    }
}
在启动类Application上加上注解:@EnableJpaRepositories(repositoryBaseClass = BaseRepositoryImpl.class)
通过代码里面的BATCH_SIZE设置批量大小。
1.2 使用jdbc
使用Spring封装的jdbcTemplate
// Batch insert via Spring's JdbcTemplate: the statement is prepared once and
// executed with list.size() parameter sets through JDBC addBatch/executeBatch.
// NOTE(review): this snippet is a fragment — `list` is assumed to be a
// List<DemoBatch> in the enclosing scope; see the full test method below.
jdbcTemplate.batchUpdate(
"INSERT INTO `demo_batch` (`id`, `name`, `content`, `create_time`, `update_time`) VALUES (?, ?, ?, ?, ?);",
new BatchPreparedStatementSetter() {
@Override
public void setValues(PreparedStatement preparedStatement, int i) throws SQLException {
preparedStatement.setLong(1, list.get(i).getId());
preparedStatement.setString(2, list.get(i).getName());
preparedStatement.setString(3, list.get(i).getContent());
preparedStatement.setTimestamp(4, new Timestamp(list.get(i).getCreateTime().getTime()));
preparedStatement.setTimestamp(5, new Timestamp(list.get(i).getUpdateTime().getTime()));
}
@Override
// Number of parameter sets in this batch — the whole list at once.
public int getBatchSize() {
return list.size();
}
});
jdbcTemplate的批量大小由BatchPreparedStatementSetter的getBatchSize()方法决定(JPA/Hibernate一侧对应的配置项是spring.jpa.properties.hibernate.jdbc.batch_size);若希望MySQL驱动把批量改写成一条长SQL,还需在连接串中开启rewriteBatchedStatements=true。
MyBatis暂时不研究。
2. 测试代码
import com.example.demo.jpa.dao.DemoBatchDao;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.jdbc.core.BatchPreparedStatementSetter;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.test.annotation.Rollback;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.transaction.annotation.Transactional;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Benchmarks for the three insert strategies: per-entity save(), the extended
 * JPA batchInsert(), and JdbcTemplate.batchUpdate(). Each test prints the
 * elapsed milliseconds; {@code @Rollback(false)} keeps the rows so MySQL's
 * general log can be inspected afterwards.
 *
 * <p>Refactored to remove the four duplicated list-building loops and the
 * duplicated JdbcTemplate boilerplate; public test method names and behavior
 * are unchanged.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@SpringBootTest(classes = DemoJpaApplication.class)
@Transactional
public class DemoJpaApplicationTests {

    @Autowired
    private DemoBatchDao demoBatchDao;

    @Autowired
    private JdbcTemplate jdbcTemplate;

    /**
     * Baseline: one save() per entity. save() first SELECTs by id to decide
     * insert-vs-update, so this is the slowest variant.
     */
    @Test
    @Rollback(value = false)
    public void testSingle() {
        List<DemoBatch> list = buildEntities(10000);
        long startTime = System.currentTimeMillis();
        for (DemoBatch entity : list) {
            demoBatchDao.save(entity);
        }
        System.out.println(System.currentTimeMillis() - startTime);
    }

    /**
     * Extended-JPA batch insert (EntityManager.persist + periodic flush/clear).
     * NOTE(review): uses count=100 while the other tests use 10000 —
     * presumably varied per benchmark run; kept as found.
     */
    @Test
    @Rollback(value = false)
    public void testBatch() {
        List<DemoBatch> list = buildEntities(100);
        long startTime = System.currentTimeMillis();
        demoBatchDao.batchInsert(list);
        System.out.println(System.currentTimeMillis() - startTime);
    }

    /** JdbcTemplate batch insert of 10000 rows. */
    @Test
    @Rollback(value = false)
    public void testJdbcTemplate() {
        List<DemoBatch> list = buildEntities(10000);
        long startTime = System.currentTimeMillis();
        jdbcBatchInsert(list);
        System.out.println(System.currentTimeMillis() - startTime);
    }

    /**
     * NOTE(review): this was a byte-for-byte duplicate of testJdbcTemplate()
     * in the original source; kept so both benchmark entries still exist.
     */
    @Test
    @Rollback(value = false)
    public void testJdbc() {
        List<DemoBatch> list = buildEntities(10000);
        long startTime = System.currentTimeMillis();
        jdbcBatchInsert(list);
        System.out.println(System.currentTimeMillis() - startTime);
    }

    /** Builds {@code count} transient DemoBatch rows with ids 1..count. */
    private List<DemoBatch> buildEntities(int count) {
        List<DemoBatch> list = new ArrayList<>(count);
        for (long i = 1; i <= count; i++) {
            DemoBatch demoBatch = new DemoBatch("aaa", "bbb");
            demoBatch.setCreateTime(new Date());
            demoBatch.setUpdateTime(new Date());
            demoBatch.setId(i);
            list.add(demoBatch);
        }
        return list;
    }

    /**
     * Inserts all rows with one JdbcTemplate.batchUpdate call; the JDBC batch
     * size equals the whole list (getBatchSize() returns list.size()).
     */
    private void jdbcBatchInsert(List<DemoBatch> list) {
        jdbcTemplate.batchUpdate(
                "INSERT INTO `demo_batch` (`id`, `name`, `content`, `create_time`, `update_time`) VALUES (?, ?, ?, ?, ?);",
                new BatchPreparedStatementSetter() {
                    @Override
                    public void setValues(PreparedStatement preparedStatement, int i) throws SQLException {
                        DemoBatch row = list.get(i);
                        preparedStatement.setLong(1, row.getId());
                        preparedStatement.setString(2, row.getName());
                        preparedStatement.setString(3, row.getContent());
                        preparedStatement.setTimestamp(4, new Timestamp(row.getCreateTime().getTime()));
                        preparedStatement.setTimestamp(5, new Timestamp(row.getUpdateTime().getTime()));
                    }

                    @Override
                    public int getBatchSize() {
                        return list.size();
                    }
                });
    }
}
3. 测试结果
原生jpa
原生jpa带事务
batch扩展jpa-size500
batch扩展jpa-size2000
batch扩展jpa-size20000
jdbc-size500
jdbc-size20000
100条记录
568
264
222
228
347
200
158
1000条记录
3564
1095
1050
947
992
1975
1218
10000条记录
23942
4924
3489
3828
3379
11875
11901
通过观察这个测试结果,我们可以得到一个结论,所谓的批量,其实就是多个插入放进一个事务里面,和jpa的save()方法加事务效果差不多。
我们打开MySQL的日志(命令:SET GLOBAL general_log = 'ON';),查看SQL语句记录。
3.1 关闭事务(测试方法去掉@Transactional注解),执行10次save()
706 Query SET autocommit=0
706 Query select demobatch0_.id as id1_3_0_, demobatch0_.content as content2_3_0_, demobatch0_.create_time as create_t3_3_0_, demobatch0_.name as name4_3_0_, demobatch0_.update_time as update_t5_3_0_ from demo_batch demobatch0_ where demobatch0_.id=1
706 Query SELECT @@session.tx_read_only
706 Query insert into demo_batch (content, create_time, name, update_time, id) values ('bbb', '2020-03-29 15:40:43.003', 'aaa', '2020-03-29 15:40:43.003', 1)
706 Query commit
706 Query SET autocommit=1
706 Query SET autocommit=0
706 Query select demobatch0_.id as id1_3_0_, demobatch0_.content as content2_3_0_, demobatch0_.create_time as create_t3_3_0_, demobatch0_.name as name4_3_0_, demobatch0_.update_time as update_t5_3_0_ from demo_batch demobatch0_ where demobatch0_.id=2
706 Query SELECT @@session.tx_read_only
706 Query insert into demo_batch (content, create_time, name, update_time, id) values ('bbb', '2020-03-29 15:40:43.003', 'aaa', '2020-03-29 15:40:43.003', 2)
706 Query commit
706 Query SET autocommit=1
......(此处省略七组重复语句)
重复执行了10次SQL插入。save()每次保存之前会查询一次,确定是插入还是更新数据,这是jpa的特点。
3.2 打开事务(测试方法加上@Transactional注解),执行10次save()
716 Query SET autocommit=0
716 Query select demobatch0_.id as id1_3_0_, demobatch0_.content as content2_3_0_, demobatch0_.create_time as create_t3_3_0_, demobatch0_.name as name4_3_0_, demobatch0_.update_time as update_t5_3_0_ from demo_batch demobatch0_ where demobatch0_.id=1
716 Query select demobatch0_.id as id1_3_0_, demobatch0_.content as content2_3_0_, demobatch0_.create_time as create_t3_3_0_, demobatch0_.name as name4_3_0_, demobatch0_.update_time as update_t5_3_0_ from demo_batch demobatch0_ where demobatch0_.id=2
716 Query select demobatch0_.id as id1_3_0_, demobatch0_.content as content2_3_0_, demobatch0_.create_time as create_t3_3_0_, demobatch0_.name as name4_3_0_, demobatch0_.update_time as update_t5_3_0_ from demo_batch demobatch0_ where demobatch0_.id=3
......(此处省略七组重复语句)
716 Query SELECT @@session.tx_read_only
716 Query insert into demo_batch (content, create_time, name, update_time, id) values ('bbb', '2020-03-29 15:47:28.875', 'aaa', '2020-03-29 15:47:28.875', 1)
716 Query insert into demo_batch (content, create_time, name, update_time, id) values ('bbb', '2020-03-29 15:47:28.875', 'aaa', '2020-03-29 15:47:28.875', 2)
716 Query insert into demo_batch (content, create_time, name, update_time, id) values ('bbb', '2020-03-29 15:47:28.875', 'aaa', '2020-03-29 15:47:28.875', 3)
......(此处省略七组重复语句)
716 Query commit
716 Query SET autocommit=1
10次执行放在了一个事务中。相比扩展jpa和jdbc多了一组select查询,所以执行时间大约是它们的两倍。
3.3 扩展jpa的batchInsert()方法
726 Query SET autocommit=0
726 Query SELECT @@session.tx_read_only
726 Query insert into demo_batch (content, create_time, name, update_time, id) values ('bbb', '2020-03-29 15:49:51.358', 'aaa', '2020-03-29 15:49:51.358', 1)
726 Query insert into demo_batch (content, create_time, name, update_time, id) values ('bbb', '2020-03-29 15:49:51.358', 'aaa', '2020-03-29 15:49:51.358', 2)
726 Query insert into demo_batch (content, create_time, name, update_time, id) values ('bbb', '2020-03-29 15:49:51.358', 'aaa', '2020-03-29 15:49:51.358', 3)
......(此处省略七组重复语句)
726 Query commit
726 Query SET autocommit=1
可以看出与3.1的区别就是指明了插入方法,不需要查询数据库去判断插入还是更新。
3.4 jdbc批量保存方法
736 Query SELECT @@session.tx_read_only
736 Query INSERT INTO `demo_batch` (`id`, `name`, `content`, `create_time`, `update_time`) VALUES (1, 'aaa', 'bbb', '2020-03-29 15:54:31.999', '2020-03-29 15:54:31.999')
736 Query INSERT INTO `demo_batch` (`id`, `name`, `content`, `create_time`, `update_time`) VALUES (2, 'aaa', 'bbb', '2020-03-29 15:54:31.999', '2020-03-29 15:54:31.999')
736 Query INSERT INTO `demo_batch` (`id`, `name`, `content`, `create_time`, `update_time`) VALUES (3, 'aaa', 'bbb', '2020-03-29 15:54:31.999', '2020-03-29 15:54:31.999')
......(此处省略七组重复语句)
736 Query commit
736 Query SET autocommit=1
SQL执行情况与3.3基本一模一样:直接批量执行insert语句,没有多余的select查询。
4. 总结
原以为批量保存是写一个长SQL,原来默认是通过事务实现的;(MySQL驱动只有开启rewriteBatchedStatements=true后才会真正改写成一条长SQL。MyBatis的实现暂不清楚);
扩展jpa中直接调用EntityManager的persist()和merge()方法,直接指明了是insert还是update,相比save()方法加事务的操作,好处是每次插入数据都节省了一次查询操作。