使用SpringBatch框架将海量数据文件中数据批量导入Cassandra数据库中
- SpringBatch通过springbatch.xml中 batch:job标签具体定义job工作内容
- step job内步骤项 这里只需要一个就可以
- tasklet 任务集,可以指定线程池来执行
- chunk : read-process-write模式 读csv文件,程序处理,然后写入Cassandra
- commit-interval : 每个chunk的大小,即每处理多少条记录提交一次事务
<batch:job id="integrationTradingDate">
<batch:step id="readTradingDateWriteToDatebase">
<batch:tasklet task-executor="taskExecutor"
throttle-limit="20">
<batch:chunk reader="inputDateReader" writer="ProductItemWriter"
processor="itemProcessor" commit-interval="12">
</batch:chunk>
</batch:tasklet>
</batch:step>
</batch:job>
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>dataIntegration</artifactId>
<groupId>com.beifa.cn.data.integration</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>tradingDate</artifactId>
<packaging>jar</packaging>
<name>A Camel Route</name>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<dependencyManagement>
<dependencies>
<!-- Camel BOM -->
<dependency>
<groupId>org.apache.camel</groupId>
<artifactId>camel-parent</artifactId>
<version>2.22.0</version>
<scope>import</scope>
<type>pom</type>
</dependency>
</dependencies>
</dependencyManagement>
<dependencies>
<dependency>
<groupId>org.apache.camel</groupId>
<artifactId>camel-core</artifactId>
</dependency>
<!-- logging -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<scope>runtime</scope>
</dependency>
<!-- testing -->
<dependency>
<groupId>org.apache.camel</groupId>
<artifactId>camel-test</artifactId>
<scope>test</scope>
</dependency>
<!-- spring -->
<dependency>
<groupId>org.springframework.batch</groupId>
<artifactId>spring-batch-core</artifactId>
<version>4.0.1.RELEASE</version>
</dependency>
<!--<dependency>-->
<!--<groupId>org.springframework</groupId>-->
<!--<artifactId>spring-jdbc</artifactId>-->
<!--<version>3.2.8.RELEASE</version>-->
<!--</dependency>-->
<!--<dependency>-->
<!--<groupId>org.springframework.data</groupId>-->
<!--<artifactId>spring-data-jpa</artifactId>-->
<!--<version>1.4.1.RELEASE</version>-->
<!--</dependency>
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-cassandra</artifactId>
<version>2.0.8.RELEASE</version>
</dependency>
-->
<!--cassandra-->
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-cassandra</artifactId>
<version>2.0.8.RELEASE</version>
</dependency>
<dependency>
<groupId>org.apache.cassandra</groupId>
<artifactId>cassandra-all</artifactId>
<version>3.11.2</version>
</dependency>
</dependencies>
<build>
<defaultGoal>install</defaultGoal>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.7.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<!-- Allows the example to be run via 'mvn compile exec:java' -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.6.0</version>
<configuration>
<mainClass>com.beifa.cn.data.integration.MainApp</mainClass>
<includePluginDependencies>false</includePluginDependencies>
</configuration>
</plugin>
</plugins>
</build>
</project>
applicationContext.xml
<!-- spring -->
<!-- 引入属性文件 -->
<context:property-placeholder location="classpath:cassandra.properties"/>
<!-- 自动扫描(自动注入) -->
<context:component-scan base-package="com.*"/>
<import resource="springbatch.xml"/>
<import resource="cassandra.xml"/>
springbatch.xml
<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:batch="http://www.springframework.org/schema/batch"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans.xsd
http://www.springframework.org/schema/batch
http://www.springframework.org/schema/batch/spring-batch.xsd
">
<!-- spring batch -->
<bean id="jobRepository"
class="org.springframework.batch.core.repository.support.MapJobRepositoryFactoryBean">
<property name="transactionManager" ref="transactionManager"/>
</bean>
<bean id="transactionManager"
class="org.springframework.batch.support.transaction.ResourcelessTransactionManager"/>
<bean id="jobLauncher"
class="org.springframework.batch.core.launch.support.SimpleJobLauncher">
<property name="jobRepository" ref="jobRepository"/>
</bean>
<batch:job id="integrationTradingDate">
<batch:step id="readTradingDateWriteToDatebase">
<batch:tasklet task-executor="taskExecutor"
throttle-limit="20">
<batch:chunk reader="inputDateReader" writer="ProductItemWriter"
processor="itemProcessor" commit-interval="12">
</batch:chunk>
</batch:tasklet>
</batch:step>
</batch:job>
<bean id="taskExecutor" class="org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor">
<property name="corePoolSize" value="60" />
<property name="maxPoolSize" value="100" />
<property name="queueCapacity" value="25" />
</bean>
<bean id="itemProcessor" class="com.beifa.cn.data.integration.spring.batch.reader.processor.TradingDateProcessor"/>
<bean id="inputDateReader" class="org.springframework.batch.item.file.FlatFileItemReader">
<property name="resource" value="classpath:input/input.csv"/>
<property name="lineMapper" ref="lineMapper"/>
<property name="linesToSkip" value="1"/>
</bean>
<bean id="lineMapper"
class="org.springframework.batch.item.file.mapping.DefaultLineMapper">
<property name="lineTokenizer">
<bean
class="org.springframework.batch.item.file.transform.DelimitedLineTokenizer">
<property name="names" value="stockCode,itemValue1,itemValue2,itemValue3"/>
</bean>
</property>
<property name="fieldSetMapper" ref="fieldSetMapper"/>
</bean>
<bean id="fieldSetMapper"
class="com.beifa.cn.data.integration.spring.batch.reader.mapper.TradeDateFieldSetMapper">
</bean>
<bean id="csvItemWriter" class="org.springframework.batch.item.file.FlatFileItemWriter">
<property name="resource" value="file:target/output/output.csv" />
<property name="lineAggregator">
<bean class="org.springframework.batch.item.file.transform.DelimitedLineAggregator">
<property name="delimiter" value=","/>
<property name="fieldExtractor">
<bean class="org.springframework.batch.item.file.transform.BeanWrapperFieldExtractor">
<property name="names" value="item_id,tradingDataDate,stockCode,itemValue1,itemValue2,itemValue3"/>
</bean>
</property>
</bean>
</property>
</bean>
<bean id="ProductItemWriter" class="com.beifa.cn.data.integration.spring.batch.writer.cassandra.TradeDateCassandraWriter"/>
</beans>
cassandra.properties
cassandra_contactpoints = 192.168.13.163
# port
cassandra_port = 9042
# keyspace in use
cassandra_keyspace = tradingdata
# login username
cassandra_username= cassandra
# login password
cassandra_password= cassandra
cassandra.xml
<!-- spring-cassandra -->
<cassandra:cluster contact-points="${cassandra_contactpoints}" port="${cassandra_port}"
username="${cassandra_username}" password="${cassandra_password}"/>
<!-- 当前使用scheam -->
<cassandra:session keyspace-name="${cassandra_keyspace}"/>
<!-- orm -->
<cassandra:mapping/>
<!-- 类型转换 -->
<cassandra:converter/>
<!-- cassandra operater cqlTemplate -->
<cassandra:template id="cassandraTemplate"/>
<!-- spring data 接口 -->
<cassandra:repositories base-package="com.beifa.cn.data.integration.spring.repository"/>
实体类
/**
* @Author: duhongjiang
* @Date: Created in 2018/7/13
*/
@Table(value = "time_series_data")
public class TradeDate implements Serializable {
private static final long serialVersionUID =1L;
@PrimaryKey(value="item_id")
private UUID itemId;
@Column(value="trading_date")
private LocalDate tradingDataDate;
@Column(value="stock_code")
private String stockCode;
@Column(value="item_value1")
private Double itemValue1;
@Column(value="item_value2")
private Double itemValue2;
@Column(value="item_value3")
private Double itemValue3;
public TradeDate(){
}
public TradeDate(String stockCode, Double itemValue1, Double itemValue2, Double itemValue3) {
this.stockCode = stockCode;
this.itemValue1 = itemValue1;
this.itemValue2 = itemValue2;
this.itemValue3 = itemValue3;
}
public TradeDate(UUID itemId, LocalDate tradingDataDate, String stockCode, Double itemValue1, Double itemValue2, Double itemValue3) {
this.itemId = itemId;
this.tradingDataDate = tradingDataDate;
this.stockCode = stockCode;
this.itemValue1 = itemValue1;
this.itemValue2 = itemValue2;
this.itemValue3 = itemValue3;
}
public UUID getItemId() {
return itemId;
}
public void setItemId(UUID itemId) {
this.itemId = itemId;
}
public LocalDate getTradingDataDate() {
return tradingDataDate;
}
public void setTradingDataDate(LocalDate tradingDataDate) {
this.tradingDataDate = tradingDataDate;
}
public String getStockCode() {
return stockCode;
}
public void setStockCode(String stockCode) {
this.stockCode = stockCode;
}
public Double getItemValue1() {
return itemValue1;
}
public void setItemValue1(Double itemValue1) {
this.itemValue1 = itemValue1;
}
public Double getItemValue2() {
return itemValue2;
}
public void setItemValue2(Double itemValue2) {
this.itemValue2 = itemValue2;
}
public Double getItemValue3() {
return itemValue3;
}
public void setItemValue3(Double itemValue3) {
this.itemValue3 = itemValue3;
}
}
映射类:读取csv数据文件映射到实体类 fieldSetMapper
public class TradeDateFieldSetMapper implements FieldSetMapper<TradeDate> {
@Override
public TradeDate mapFieldSet(FieldSet fieldSet) throws BindException {
TradeDate tradeDate =new TradeDate();
tradeDate.setTradingDataDate(LocalDate.fromMillisSinceEpoch(System.currentTimeMillis()));
tradeDate.setItemId(UUID.randomUUID());
tradeDate.setStockCode(fieldSet.readString("stockCode"));
tradeDate.setItemValue1(fieldSet.readDouble("itemValue1"));
tradeDate.setItemValue2(fieldSet.readDouble("itemValue2"));
tradeDate.setItemValue3(fieldSet.readDouble("itemValue3"));
return tradeDate;
}
}
public class TradeDateCassandraWriter implements ItemWriter<TradeDate> {
@Autowired
TradeDateRepository tradeDateRepository;
@Override
public void write(List<? extends TradeDate> list) throws Exception {
for(TradeDate tradeDate:list){
tradeDateRepository.save(tradeDate);
}
}
}
public class TradingDateProcessor implements ItemProcessor<TradeDate,TradeDate> {
@Override
public TradeDate process(TradeDate tradeDate) throws Exception {
return tradeDate;
}
}
public interface TradeDateRepository extends CrudRepository<TradeDate,String> {
}