由于 Kettle 采用流的方式对数据进行处理,在速度方面具有一定的优势。面对大量数据入库时,可以先用 Kettle 软件把数据处理任务开发出来,然后通过 Java 调用 Kettle API 来执行该任务,从而把数据处理集成到 Java 代码中。
导入依赖
<dependencies>
    <!-- Kettle core library -->
    <dependency>
        <groupId>pentaho-kettle</groupId>
        <artifactId>kettle-core</artifactId>
        <version>7.1.0.0-12</version>
    </dependency>

    <!-- Kettle engine library (same version as kettle-core) -->
    <dependency>
        <groupId>pentaho-kettle</groupId>
        <artifactId>kettle-engine</artifactId>
        <version>7.1.0.0-12</version>
    </dependency>

    <!-- MySQL driver: add this dependency when the application uses a database connection -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.37</version>
    </dependency>
</dependencies>
准备测试用的kettle转换
转换的目标是把表 ddd 中的数据写到 ddd_copy1 表中。两个表的列定义相同(ddd_copy1 的 id 列另外带有 AUTO_INCREMENT 属性),并且都通过外键引用 dept 表的 id 列,因此执行下面的建表语句之前需要先创建 dept 表。
建表语句
-- Source table: the test transformation reads its rows with SELECT * FROM ddd.
-- The foreign key below refers to dept(id), so a `dept` table must already exist.
CREATE TABLE `ddd` (
    `id`        INT(11)     NOT NULL,
    `NAME`      VARCHAR(10) DEFAULT NULL,
    `gender`    CHAR(1)     DEFAULT NULL,
    `salary`    DOUBLE      DEFAULT NULL,
    `join_date` DATE        DEFAULT NULL,
    `dept_id`   INT(11)     DEFAULT NULL,
    PRIMARY KEY (`id`),
    KEY `dept_id` (`dept_id`),
    CONSTRAINT `ddd_ibfk_1` FOREIGN KEY (`dept_id`) REFERENCES `dept` (`id`)
) ENGINE = InnoDB
  DEFAULT CHARSET = utf8;
-- Target table: the test transformation writes the rows read from `ddd` into it.
-- Same columns as `ddd`, except that `id` is AUTO_INCREMENT (counter set to 22).
-- The foreign key below refers to dept(id), so a `dept` table must already exist.
CREATE TABLE `ddd_copy1` (
    `id`        INT(11)     NOT NULL AUTO_INCREMENT,
    `NAME`      VARCHAR(10) DEFAULT NULL,
    `gender`    CHAR(1)     DEFAULT NULL,
    `salary`    DOUBLE      DEFAULT NULL,
    `join_date` DATE        DEFAULT NULL,
    `dept_id`   INT(11)     DEFAULT NULL,
    PRIMARY KEY (`id`),
    KEY `dept_id` (`dept_id`),
    CONSTRAINT `ddd_copy1_ibfk_1` FOREIGN KEY (`dept_id`) REFERENCES `dept` (`id`)
) ENGINE = InnoDB
  AUTO_INCREMENT = 22
  DEFAULT CHARSET = utf8;
数据库配置
数据库ip | port | username | password |
---|---|---|---|
localhost | 3306 | root | root |
以上数据库配置需要与下面 test.ktr 中 db_config 连接的配置保持一致(该连接使用的数据库名为 db1),不要随意更改。
ktr转换
可以把下面的 XML 内容复制下来,保存为 test.ktr 文件(后面的测试代码按相对路径 etl/test.ktr 加载该文件)。
<?xml version="1.0" encoding="UTF-8"?>
<transformation>
<!-- General settings of the transformation named "test": logging tables,
     row-set and feedback settings, and creation/modification metadata. -->
<info>
<name>test</name>
<description/>
<extended_description/>
<trans_version/>
<trans_type>Normal</trans_type>
<directory>/</directory>
<!-- No named parameters are declared. -->
<parameters>
</parameters>
<!-- Logging configuration. Every log table below has an empty <connection/> and
     <table/>. NOTE(review): presumably this means no database logging is
     active; confirm against the Kettle documentation. -->
<log>
<!-- Transformation log table: field list with per-field enabled flags
     (EXECUTING_SERVER, EXECUTING_USER and CLIENT are disabled). -->
<trans-log-table>
<connection/>
<schema/>
<table/>
<size_limit_lines/>
<interval/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STATUS</id>
<enabled>Y</enabled>
<name>STATUS</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
<subject/>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
<subject/>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
<subject/>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
<subject/>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
<subject/>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
<subject/>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>STARTDATE</id>
<enabled>Y</enabled>
<name>STARTDATE</name>
</field>
<field>
<id>ENDDATE</id>
<enabled>Y</enabled>
<name>ENDDATE</name>
</field>
<field>
<id>LOGDATE</id>
<enabled>Y</enabled>
<name>LOGDATE</name>
</field>
<field>
<id>DEPDATE</id>
<enabled>Y</enabled>
<name>DEPDATE</name>
</field>
<field>
<id>REPLAYDATE</id>
<enabled>Y</enabled>
<name>REPLAYDATE</name>
</field>
<field>
<id>LOG_FIELD</id>
<enabled>Y</enabled>
<name>LOG_FIELD</name>
</field>
<field>
<id>EXECUTING_SERVER</id>
<enabled>N</enabled>
<name>EXECUTING_SERVER</name>
</field>
<field>
<id>EXECUTING_USER</id>
<enabled>N</enabled>
<name>EXECUTING_USER</name>
</field>
<field>
<id>CLIENT</id>
<enabled>N</enabled>
<name>CLIENT</name>
</field>
</trans-log-table>
<!-- Performance log table: all listed fields are enabled. -->
<perf-log-table>
<connection/>
<schema/>
<table/>
<interval/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>SEQ_NR</id>
<enabled>Y</enabled>
<name>SEQ_NR</name>
</field>
<field>
<id>LOGDATE</id>
<enabled>Y</enabled>
<name>LOGDATE</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STEPNAME</id>
<enabled>Y</enabled>
<name>STEPNAME</name>
</field>
<field>
<id>STEP_COPY</id>
<enabled>Y</enabled>
<name>STEP_COPY</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>INPUT_BUFFER_ROWS</id>
<enabled>Y</enabled>
<name>INPUT_BUFFER_ROWS</name>
</field>
<field>
<id>OUTPUT_BUFFER_ROWS</id>
<enabled>Y</enabled>
<name>OUTPUT_BUFFER_ROWS</name>
</field>
</perf-log-table>
<!-- Logging-channel log table: all listed fields are enabled. -->
<channel-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>LOGGING_OBJECT_TYPE</id>
<enabled>Y</enabled>
<name>LOGGING_OBJECT_TYPE</name>
</field>
<field>
<id>OBJECT_NAME</id>
<enabled>Y</enabled>
<name>OBJECT_NAME</name>
</field>
<field>
<id>OBJECT_COPY</id>
<enabled>Y</enabled>
<name>OBJECT_COPY</name>
</field>
<field>
<id>REPOSITORY_DIRECTORY</id>
<enabled>Y</enabled>
<name>REPOSITORY_DIRECTORY</name>
</field>
<field>
<id>FILENAME</id>
<enabled>Y</enabled>
<name>FILENAME</name>
</field>
<field>
<id>OBJECT_ID</id>
<enabled>Y</enabled>
<name>OBJECT_ID</name>
</field>
<field>
<id>OBJECT_REVISION</id>
<enabled>Y</enabled>
<name>OBJECT_REVISION</name>
</field>
<field>
<id>PARENT_CHANNEL_ID</id>
<enabled>Y</enabled>
<name>PARENT_CHANNEL_ID</name>
</field>
<field>
<id>ROOT_CHANNEL_ID</id>
<enabled>Y</enabled>
<name>ROOT_CHANNEL_ID</name>
</field>
</channel-log-table>
<!-- Step log table: all listed fields are enabled except LOG_FIELD. -->
<step-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STEPNAME</id>
<enabled>Y</enabled>
<name>STEPNAME</name>
</field>
<field>
<id>STEP_COPY</id>
<enabled>Y</enabled>
<name>STEP_COPY</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>LOG_FIELD</id>
<enabled>N</enabled>
<name>LOG_FIELD</name>
</field>
</step-log-table>
<!-- Metrics log table: all listed fields are enabled. -->
<metrics-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>METRICS_DATE</id>
<enabled>Y</enabled>
<name>METRICS_DATE</name>
</field>
<field>
<id>METRICS_CODE</id>
<enabled>Y</enabled>
<name>METRICS_CODE</name>
</field>
<field>
<id>METRICS_DESCRIPTION</id>
<enabled>Y</enabled>
<name>METRICS_DESCRIPTION</name>
</field>
<field>
<id>METRICS_SUBJECT</id>
<enabled>Y</enabled>
<name>METRICS_SUBJECT</name>
</field>
<field>
<id>METRICS_TYPE</id>
<enabled>Y</enabled>
<name>METRICS_TYPE</name>
</field>
<field>
<id>METRICS_VALUE</id>
<enabled>Y</enabled>
<name>METRICS_VALUE</name>
</field>
</metrics-log-table>
</log>
<!-- Max-date settings: no connection, table or field is set; offset and maxdiff are 0.0. -->
<maxdate>
<connection/>
<table/>
<field/>
<offset>0.0</offset>
<maxdiff>0.0</maxdiff>
</maxdate>
<!-- Engine settings. NOTE(review): size_rowset is presumably the number of rows
     buffered between steps and the sleep_time values are presumably in
     milliseconds; confirm against the Kettle documentation. -->
<size_rowset>10000</size_rowset>
<sleep_time_empty>50</sleep_time_empty>
<sleep_time_full>50</sleep_time_full>
<unique_connections>N</unique_connections>
<feedback_shown>Y</feedback_shown>
<feedback_size>50000</feedback_size>
<using_thread_priorities>Y</using_thread_priorities>
<shared_objects_file/>
<!-- Step performance capturing is switched off (capture_step_performance = N). -->
<capture_step_performance>N</capture_step_performance>
<step_performance_capturing_delay>1000</step_performance_capturing_delay>
<step_performance_capturing_size_limit>100</step_performance_capturing_size_limit>
<!-- No dependencies, partition schemas, slave servers or cluster schemas are defined. -->
<dependencies>
</dependencies>
<partitionschemas>
</partitionschemas>
<slaveservers>
</slaveservers>
<clusterschemas>
</clusterschemas>
<!-- Creation and modification metadata. -->
<created_user>-</created_user>
<created_date>2023/03/28 18:57:02.367</created_date>
<modified_user>-</modified_user>
<modified_date>2023/03/28 18:57:02.367</modified_date>
<key_for_session_key/>
<is_key_private>N</is_key_private>
</info>
<notepads>
</notepads>
<!-- Database connection "db_config": MySQL (native access) on localhost:3306,
     database db1, user root. Both steps of this transformation refer to the
     connection by this name. -->
<connection>
<name>db_config</name>
<server>localhost</server>
<type>MYSQL</type>
<access>Native</access>
<database>db1</database>
<port>3306</port>
<username>root</username>
<!-- The password is stored with an "Encrypted" prefix rather than as plain text.
     NOTE(review): presumably Kettle's built-in obfuscation; confirm. -->
<password>Encrypted 2be98afc86aa7f2e4cb79ce10cc9da0ce</password>
<servername/>
<data_tablespace/>
<index_tablespace/>
<!-- Connection options as code/value pairs. -->
<attributes>
<attribute>
<code>FORCE_IDENTIFIERS_TO_LOWERCASE</code>
<attribute>N</attribute>
</attribute>
<attribute>
<code>FORCE_IDENTIFIERS_TO_UPPERCASE</code>
<attribute>N</attribute>
</attribute>
<attribute>
<code>IS_CLUSTERED</code>
<attribute>N</attribute>
</attribute>
<!-- Repeats the port given in the port element above. -->
<attribute>
<code>PORT_NUMBER</code>
<attribute>3306</attribute>
</attribute>
<attribute>
<code>PRESERVE_RESERVED_WORD_CASE</code>
<attribute>Y</attribute>
</attribute>
<attribute>
<code>QUOTE_ALL_FIELDS</code>
<attribute>N</attribute>
</attribute>
<attribute>
<code>STREAM_RESULTS</code>
<attribute>Y</attribute>
</attribute>
<attribute>
<code>SUPPORTS_BOOLEAN_DATA_TYPE</code>
<attribute>Y</attribute>
</attribute>
<attribute>
<code>SUPPORTS_TIMESTAMP_DATA_TYPE</code>
<attribute>Y</attribute>
</attribute>
<attribute>
<code>USE_POOLING</code>
<attribute>N</attribute>
</attribute>
</attributes>
</connection>
<!-- Hop list: a single enabled hop connecting the step named 表输入 (table input)
     to the step named 表输出 (table output). -->
<order>
<hop>
<from>表输入</from>
<to>表输出</to>
<enabled>Y</enabled>
</hop>
</order>
<!-- Table input step 表输入: runs the query in the sql element on connection
     db_config, as a single copy and without partitioning. -->
<step>
<name>表输入</name>
<type>TableInput</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<connection>db_config</connection>
<sql>SELECT * FROM ddd </sql>
<!-- NOTE(review): a limit of 0 presumably means "no row limit"; confirm. -->
<limit>0</limit>
<lookup/>
<execute_each_row>N</execute_each_row>
<variables_active>N</variables_active>
<lazy_conversion_active>N</lazy_conversion_active>
<cached_row_meta_active>N</cached_row_meta_active>
<!-- Stored description of the six output fields: id, NAME, gender, salary,
     join_date and dept_id. All use locale zh_CN and time zone Asia/Shanghai. -->
<row-meta>
<!-- id: Integer, length 9 -->
<value-meta>
<type>Integer</type>
<storagetype>normal</storagetype>
<name>id</name>
<length>9</length>
<precision>0</precision>
<origin>表输入</origin>
<comments>id</comments>
<conversion_Mask>####0;-####0</conversion_Mask>
<decimal_symbol>.</decimal_symbol>
<grouping_symbol>,</grouping_symbol>
<currency_symbol/>
<trim_type>none</trim_type>
<case_insensitive>N</case_insensitive>
<collator_disabled>Y</collator_disabled>
<collator_strength>0</collator_strength>
<sort_descending>N</sort_descending>
<output_padding>N</output_padding>
<date_format_lenient>N</date_format_lenient>
<date_format_locale>zh_CN</date_format_locale>
<date_format_timezone>Asia/Shanghai</date_format_timezone>
<lenient_string_to_number>N</lenient_string_to_number>
</value-meta>
<!-- NAME: String, length 10 -->
<value-meta>
<type>String</type>
<storagetype>normal</storagetype>
<name>NAME</name>
<length>10</length>
<precision>-1</precision>
<origin>表输入</origin>
<comments>NAME</comments>
<conversion_Mask/>
<decimal_symbol>.</decimal_symbol>
<grouping_symbol>,</grouping_symbol>
<currency_symbol/>
<trim_type>none</trim_type>
<case_insensitive>N</case_insensitive>
<collator_disabled>Y</collator_disabled>
<collator_strength>0</collator_strength>
<sort_descending>N</sort_descending>
<output_padding>N</output_padding>
<date_format_lenient>N</date_format_lenient>
<date_format_locale>zh_CN</date_format_locale>
<date_format_timezone>Asia/Shanghai</date_format_timezone>
<lenient_string_to_number>N</lenient_string_to_number>
</value-meta>
<!-- gender: String, length 1 -->
<value-meta>
<type>String</type>
<storagetype>normal</storagetype>
<name>gender</name>
<length>1</length>
<precision>-1</precision>
<origin>表输入</origin>
<comments>gender</comments>
<conversion_Mask/>
<decimal_symbol>.</decimal_symbol>
<grouping_symbol>,</grouping_symbol>
<currency_symbol/>
<trim_type>none</trim_type>
<case_insensitive>N</case_insensitive>
<collator_disabled>Y</collator_disabled>
<collator_strength>0</collator_strength>
<sort_descending>N</sort_descending>
<output_padding>N</output_padding>
<date_format_lenient>N</date_format_lenient>
<date_format_locale>zh_CN</date_format_locale>
<date_format_timezone>Asia/Shanghai</date_format_timezone>
<lenient_string_to_number>N</lenient_string_to_number>
</value-meta>
<!-- salary: Number, length and precision both -1 -->
<value-meta>
<type>Number</type>
<storagetype>normal</storagetype>
<name>salary</name>
<length>-1</length>
<precision>-1</precision>
<origin>表输入</origin>
<comments>salary</comments>
<conversion_Mask>####0.0#########;-####0.0#########</conversion_Mask>
<decimal_symbol>.</decimal_symbol>
<grouping_symbol>,</grouping_symbol>
<currency_symbol/>
<trim_type>none</trim_type>
<case_insensitive>N</case_insensitive>
<collator_disabled>Y</collator_disabled>
<collator_strength>0</collator_strength>
<sort_descending>N</sort_descending>
<output_padding>N</output_padding>
<date_format_lenient>N</date_format_lenient>
<date_format_locale>zh_CN</date_format_locale>
<date_format_timezone>Asia/Shanghai</date_format_timezone>
<lenient_string_to_number>N</lenient_string_to_number>
</value-meta>
<!-- join_date: Date, no conversion mask -->
<value-meta>
<type>Date</type>
<storagetype>normal</storagetype>
<name>join_date</name>
<length>-1</length>
<precision>-1</precision>
<origin>表输入</origin>
<comments>join_date</comments>
<conversion_Mask/>
<decimal_symbol>.</decimal_symbol>
<grouping_symbol>,</grouping_symbol>
<currency_symbol/>
<trim_type>none</trim_type>
<case_insensitive>N</case_insensitive>
<collator_disabled>Y</collator_disabled>
<collator_strength>0</collator_strength>
<sort_descending>N</sort_descending>
<output_padding>N</output_padding>
<date_format_lenient>N</date_format_lenient>
<date_format_locale>zh_CN</date_format_locale>
<date_format_timezone>Asia/Shanghai</date_format_timezone>
<lenient_string_to_number>N</lenient_string_to_number>
</value-meta>
<!-- dept_id: Integer, length 9 -->
<value-meta>
<type>Integer</type>
<storagetype>normal</storagetype>
<name>dept_id</name>
<length>9</length>
<precision>0</precision>
<origin>表输入</origin>
<comments>dept_id</comments>
<conversion_Mask>####0;-####0</conversion_Mask>
<decimal_symbol>.</decimal_symbol>
<grouping_symbol>,</grouping_symbol>
<currency_symbol/>
<trim_type>none</trim_type>
<case_insensitive>N</case_insensitive>
<collator_disabled>Y</collator_disabled>
<collator_strength>0</collator_strength>
<sort_descending>N</sort_descending>
<output_padding>N</output_padding>
<date_format_lenient>N</date_format_lenient>
<date_format_locale>zh_CN</date_format_locale>
<date_format_timezone>Asia/Shanghai</date_format_timezone>
<lenient_string_to_number>N</lenient_string_to_number>
</value-meta>
</row-meta>
<attributes/>
<cluster_schema/>
<remotesteps>
<input>
</input>
<output>
</output>
</remotesteps>
<!-- Canvas position of the step in the graphical editor. -->
<GUI>
<xloc>272</xloc>
<yloc>144</yloc>
<draw>Y</draw>
</GUI>
</step>
<!-- Table output step 表输出: writes incoming rows into table ddd_copy1 on
     connection db_config, as a single copy and without partitioning. -->
<step>
<name>表输出</name>
<type>TableOutput</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<connection>db_config</connection>
<schema/>
<table>ddd_copy1</table>
<!-- NOTE(review): commit is presumably the number of rows per commit; confirm. -->
<commit>1000</commit>
<!-- truncate is N, so the step does not empty ddd_copy1 first. Because the id
     column is copied as well, NOTE(review): a repeated run would presumably fail
     on duplicate primary keys; confirm. -->
<truncate>N</truncate>
<ignore_errors>N</ignore_errors>
<use_batch>Y</use_batch>
<!-- specify_fields is Y: the column mapping is listed explicitly in the fields element. -->
<specify_fields>Y</specify_fields>
<!-- Table partitioning is disabled (partitioning_enabled = N). -->
<partitioning_enabled>N</partitioning_enabled>
<partitioning_field/>
<partitioning_daily>N</partitioning_daily>
<partitioning_monthly>Y</partitioning_monthly>
<tablename_in_field>N</tablename_in_field>
<tablename_field/>
<tablename_in_table>Y</tablename_in_table>
<return_keys>N</return_keys>
<return_field/>
<!-- Column mapping: each of the six stream fields goes to the table column of the same name. -->
<fields>
<field>
<column_name>id</column_name>
<stream_name>id</stream_name>
</field>
<field>
<column_name>NAME</column_name>
<stream_name>NAME</stream_name>
</field>
<field>
<column_name>gender</column_name>
<stream_name>gender</stream_name>
</field>
<field>
<column_name>salary</column_name>
<stream_name>salary</stream_name>
</field>
<field>
<column_name>join_date</column_name>
<stream_name>join_date</stream_name>
</field>
<field>
<column_name>dept_id</column_name>
<stream_name>dept_id</stream_name>
</field>
</fields>
<attributes/>
<cluster_schema/>
<remotesteps>
<input>
</input>
<output>
</output>
</remotesteps>
<!-- Canvas position of the step in the graphical editor. -->
<GUI>
<xloc>560</xloc>
<yloc>160</yloc>
<draw>Y</draw>
</GUI>
</step>
<step_error_handling>
</step_error_handling>
<slave-step-copy-partition-distribution>
</slave-step-copy-partition-distribution>
<slave_transformation>N</slave_transformation>
<attributes/>
</transformation>
测试
import org.pentaho.di.core.KettleEnvironment;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
/**
 * Minimal example of running a Kettle transformation from Java: loads the
 * transformation stored in {@code etl/test.ktr} and executes it.
 */
public class Test {

    /**
     * Initialises Kettle, loads {@code etl/test.ktr}, runs the transformation to
     * completion and reports the outcome.
     *
     * @param args command-line arguments (not used)
     * @throws KettleException if a Kettle call fails, or if the finished
     *                         transformation reports one or more errors
     */
    public static void main(String[] args) throws KettleException {
        // Initialise the Kettle environment before any other Kettle class is used.
        KettleEnvironment.init();

        // Build the transformation metadata from the .ktr file.
        TransMeta meta = new TransMeta("etl/test.ktr");
        Trans trans = new Trans(meta);

        // Prepare the steps (no arguments), start them, then block until they finish.
        trans.prepareExecution(null);
        trans.startThreads();
        trans.waitUntilFinished();

        // Fail loudly: without this check a run that reported errors would end
        // silently and the process would still exit with status 0.
        int errorCount = trans.getErrors();
        if (errorCount != 0) {
            throw new KettleException("转换执行失败,错误数: " + errorCount);
        }
        System.out.println("执行成功!");
    }
}
注意
由于大部分 Kettle 资源都存放在国外服务器上,导入依赖时可能不成功,此时可以尝试切换到能够访问外网的网络环境后重试。如在使用过程中有任何问题,请留言。