GIT项目
项目一:https://github.com/michal-harish/kafka-hadoop-loader (本文使用此项目)
项目二:https://github.com/JavaNikon/kafka-hadoop-loader
【配置maven】
# whereis maven
# vi settings.xml
<mirrors>
<mirror>
<id>nexus</id>
<mirrorOf>central</mirrorOf>
<name>internal nexus repository</name>
<url>https://10.41.103.97/artifactory/public-maven-virtual/</url>
</mirror>
</mirrors>
【解压编译】【主从节点都可以】
# cd /usr/local
# tar -xzvf kafka-hadoop-loader-master.tar.gz
【关闭压缩】(编译前修改代码中输出数据格式,默认压缩,可关闭)
# cd kafka-hadoop-loader
# vi src/main/java/io/amient/kafka/hadoop/io/MultiOutputFormat.java
if (isCompressed) {
rw = new LineRecordWriter(fileOut);
} else {
rw = new LineRecordWriter(fileOut);
}
【编译打包】
# mvn package
# cd target
【执行命令】
hadoop jar kafka-hadoop-loader.jar -z 10.47.85.214:2181,10.47.85.213:2181 -t 12345 hdfs://10.47.85.215:9000/flume
【数据归置】
# hdfs dfs -ls -R /flume/12345 |grep "^\-rw"|awk '{print $NF}'
/flume/12345/1/12345-1-0000000000000000000
/flume/12345/2/12345-2-0000000000000000000
/flume/12345/3/12345-3-0000000000000000000
/flume/12345/4/12345-4-0000000000000000000
/flume/12345/5/12345-5-0000000000000000000
/flume/12345/6/12345-6-0000000000000000000
/flume/12345/7/12345-7-0000000000000000000
/flume/12345/8/12345-8-0000000000000000000
/flume/12345/9/12345-9-0000000000000000000
#结果文件迁移根目录
# Move each result file to the HDFS root directory, keeping only its base
# filename. Bug fix: the original loop body referenced an undefined variable
# ($line) instead of the loop variable ($hdfsfile), so every file's basename
# expanded to empty and the target path degenerated to "/".
for hdfsfile in $(hdfs dfs -ls -R /TestSql/output/1/output.txt.part* |grep "^\-rw"|awk '{print $NF}');do
  # ${hdfsfile##*/} strips everything up to the last '/', i.e. the basename,
  # without forking an extra echo|awk pipeline per file.
  hdfs dfs -mv "${hdfsfile}" "/${hdfsfile##*/}"
done
# Recursively delete the per-topic directories left under /TestSql after the
# result files have been moved out. Bug fix: the original filter was
# grep "^\drwx" — '\d' is not a character class in grep's BRE (it matches a
# literal backslash-d or nothing, depending on the grep implementation).
# Directory entries in an 'hdfs dfs -ls' listing begin with 'd', so match that.
for hdfsfolder in $(hdfs dfs -ls /TestSql|grep "^d"|awk '{print $NF}');do
  hdfs dfs -rm -R "${hdfsfolder}"
done
编译前修改代码中输出数据格式,默认压缩,可关闭。
vi src/main/java/io/amient/kafka/hadoop/io/MultiOutputFormat.java
if (isCompressed) {
rw = new LineRecordWriter(fileOut);
} else {
rw = new LineRecordWriter(fileOut);
}