Method 1: Write your own program that consumes from Kafka and writes to HDFS. Note that the Kafka dependency versions in your IDEA Maven project must match the Kafka version installed on the virtual machines:
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka_2.11</artifactId>
    <version>0.8.2.1</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>0.8.2.2</version>
</dependency>
package com.atguigu.offline;

import kafka.consumer.Consumer;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import kafka.message.MessageAndMetadata;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * @author wade
 * @create 2019-03-21 10:32
 */
public class MyKafkaConsumer {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop103:9000"), conf, "atguigu");

        Properties prop = new Properties();
        prop.load(ClassLoader.getSystemResourceAsStream("config.properties"));
        ConsumerConfig config = new ConsumerConfig(prop);
        ConsumerConnector javaConsumerConnector = Consumer.createJavaConsumerConnector(config);

        Map<String, Integer> map = new HashMap<>();
        map.put(prop.getProperty("kafka.topic"), 1);
        Map<String, List<KafkaStream<byte[], byte[]>>> messageStreams = javaConsumerConnector.createMessageStreams(map);
        List<KafkaStream<byte[], byte[]>> kafkaStreams = messageStreams.get(prop.getProperty("kafka.topic"));
        ConsumerIterator<byte[], byte[]> iterator = kafkaStreams.get(0).iterator();

        FSDataOutputStream fsDataStream;
        // record the current time and open the output file for this minute
        long lastTime = System.currentTimeMillis();
        Path descPath = getPath(lastTime);
        fsDataStream = getOutputStream(fs, descPath);

        while (iterator.hasNext()) {
            // roll to a new file once a minute has passed
            if (System.currentTimeMillis() - lastTime > 60000) {
                fsDataStream.hflush();
                fsDataStream.close();
                lastTime = System.currentTimeMillis();
                int minutes = new Date(lastTime).getMinutes();
                System.out.println("Minute " + minutes + ": data written, starting the next minute's file");
                fsDataStream = getOutputStream(fs, getPath(lastTime));
            }
            MessageAndMetadata<byte[], byte[]> metadata = iterator.next();
            byte[] message = metadata.message();
            // remember to append \n so each message ends up on its own line
            fsDataStream.write((new String(message) + "\n").getBytes());
            fsDataStream.hflush();
            fsDataStream.hsync();
        }
    }

    public static FSDataOutputStream getOutputStream(FileSystem fs, Path descPath) {
        try {
            if (fs.exists(descPath)) {
                // append if the file already exists
                return fs.append(descPath);
            } else {
                return fs.create(descPath);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    public static String getDirFromString(String dateString) {
        return "/" + dateString.split(" ")[0];
    }

    public static String getFileNameFromString(String dateString) {
        return dateString.split(" ")[1];
    }

    public static Path getPath(long lastTime) {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd HHmm");
        // the date part becomes the directory /yyyy/MM/dd, the time part HHmm becomes the file name
        String dateString = sdf.format(new Date(lastTime));
        // directory derived from the date
        String dirName = getDirFromString(dateString);
        // file name derived from the time
        String fileName = getFileNameFromString(dateString);
        // getOutputStream decides whether to create this path or append to it
        Path descPath = new Path(dirName + "/" + fileName);
        return descPath;
    }
}
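The code above loads config.properties from the classpath. The old 0.8.x high-level consumer used here takes its connection settings (zookeeper.connect, group.id) from those properties, and kafka.topic is the custom key the code reads. A minimal sketch, assuming the same hadoop103-105 ZooKeeper quorum and log-analysis topic that appear in the Flume configuration below (the group id g1 is only a placeholder):

zookeeper.connect=hadoop103:2181,hadoop104:2181,hadoop105:2181
group.id=g1
kafka.topic=log-analysis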
Method 2: Use Flume's KafkaSource and HDFS sink.
Flume 1.6 configuration:
a3.sources = r1
a3.channels = c1
a3.sinks = k1
a3.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a3.sources.r1.zookeeperConnect = hadoop103:2181,hadoop104:2181,hadoop105:2181
a3.sources.r1.groupId = g2
a3.sources.r1.topic = log-analysis
a3.channels.c1.type = file
a3.channels.c1.checkpointDir = /opt/module/flume-1.6/ck3/behavior_collect
a3.channels.c1.dataDirs = /opt/module/flume-1.6/data3/behavior_collect
a3.sinks.k1.type = hdfs
a3.sinks.k1.hdfs.path = hdfs://hadoop103:9000/%Y/%m/%d
a3.sinks.k1.hdfs.filePrefix = %H%M
a3.sinks.k1.hdfs.rollInterval = 60
# do not roll based on event count
a3.sinks.k1.hdfs.rollCount = 0
# do not roll based on file size
a3.sinks.k1.hdfs.rollSize = 0
# without this setting, none of the roll policies above take effect; restart the agent after configuring
# useLocalTimeStamp acts like a timestamp interceptor; without it escape sequences such as %Y cannot be resolved
#http://www.aboutyun.com/thread-21365-1-1.html
a3.sinks.k1.hdfs.minBlockReplicas=1
a3.sinks.k1.hdfs.useLocalTimeStamp=true
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
a3.sinks.k1.hdfs.minBlockReplicas=1
This parameter must be set; without it none of the roll settings above have any effect.
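After editing the configuration, restart the agent. Assuming the file is saved as job/kafka-hdfs.conf (a placeholder path and name), the agent can be started with something like:

bin/flume-ng agent --conf conf --name a3 --conf-file job/kafka-hdfs.conf -Dflume.root.logger=INFO,console

The --name value must match the agent prefix used in the configuration (a3 here).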