# Sending Kafka data to HDFS

Integrating the consumer with HDFS.

Task, in the consumerDemo class:
1. Save the data collected by Flume to a temporary local file.
2. Every half hour, upload the temporary file to HDFS.
Approach:
1. The consumer reads data from Kafka into records (the Java code fetches it with KafkaConsumer.poll).
2. Write the records out to a new local file, e.g. temp201910151440.log (a sketch of generating such a timestamped name follows this list).
3. Then upload the local file from m1 to HDFS.
4. Finally, add a timer so the upload runs on a schedule.
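The approach names a timestamped temp file (temp201910151440.log), while the consumer below writes to a fixed log.log. A minimal sketch of building such a name, assuming the same local log directory as the consumer code and a yyyyMMddHHmm pattern (both are my assumptions, not taken from the original code):

import java.text.SimpleDateFormat;
import java.util.Date;

public class TempFileName {
    public static void main(String[] args) {
        // Format the current time as yyyyMMddHHmm, giving names like temp201910151440.log
        SimpleDateFormat fmt = new SimpleDateFormat("yyyyMMddHHmm");
        String fileName = "temp" + fmt.format(new Date()) + ".log";
        // Assumed directory: the same local log folder the consumer writes to
        System.out.println("D:/KJ/hadoopKJ/shixunhadoop/log/" + fileName);
    }
}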
Consumer code:
package com.zpark.kafkatest.two;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Collections;
import java.util.Properties;

public class consumerDemo {

    private static KafkaConsumer<String, String> consumer;
    private static Properties props;

    static {
        props = new Properties();
        // Kafka broker address the consumer connects to
        props.put("bootstrap.servers", "hdp-2:9092");
        // key/value deserializers
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // consumer group
        props.put("group.id", "yangk");
    }

    /**
     * Fetch data from Kafka (Spring Boot also ships a Kafka integration).
     */
    private static void ConsumerMessage() {
        // allow automatic offset commits
        props.put("enable.auto.commit", true);
        consumer = new KafkaConsumer<String, String>(props);
        consumer.subscribe(Collections.singleton("test"));

        // make sure the local log directory exists
        String path = "D:/KJ/hadoopKJ/shixunhadoop/log";
        File file = new File(path);
        file.mkdirs();

        // Poll for data in a loop. The broker removes messages after the configured retention
        // time; a message that was already consumed can be read again by seeking to its offset.
        FileWriter fw = null;
        try {
            fw = new FileWriter("D:/KJ/hadoopKJ/shixunhadoop/log/log.log");
            while (true) {
                // Read data from Kafka into records via KafkaConsumer.poll
                ConsumerRecords<String, String> records = consumer.poll(100);
                for (ConsumerRecord<String, String> r : records) {
                    fw.write(r.toString() + System.lineSeparator());
                    fw.flush();
                    System.out.printf("topic = %s, offset = %s, key = %s, value = %s%n",
                            r.topic(), r.offset(), r.key(), r.value());
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (fw != null) {
                try {
                    fw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            consumer.close();
        }
    }

    public static void main(String[] args) {
        ConsumerMessage();
    }
}
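One note on the client API: consumer.poll(100) uses the poll(long) overload, which is deprecated from kafka-clients 2.0 onward in favor of poll(Duration). If the project is on a newer client, the call would look like this (a minimal sketch; the class name PollWithDuration and the single poll are only for illustration):

import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

public class PollWithDuration {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "hdp-2:9092");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("group.id", "yangk");

        // KafkaConsumer implements Closeable, so try-with-resources closes it for us
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singleton("test"));
            // poll(Duration) replaces the deprecated poll(long) overload
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(100));
            System.out.println("fetched " + records.count() + " records");
        }
    }
}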
Sink from the local file to HDFS, run on a schedule:
package com.zpark.kafkatest.two;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.TimerTask;

public class HdfsTest extends TimerTask {

    // The upload logic lives in run() so the Timer can trigger it on every tick.
    @Override
    public void run() {
        URI uri = null;
        Configuration conf = null;
        String user = "root";
        FileSystem fs = null;
        try {
            uri = new URI("hdfs://hdp-1:9000");
            conf = new Configuration();
            // dfs.replication: number of replicas kept by the distributed file system
            conf.set("dfs.replication", "2");
            // dfs.blocksize: block size of the distributed file system
            conf.set("dfs.blocksize", "64m");
            fs = FileSystem.get(uri, conf, user);

            // copy the local temp log file up to HDFS
            Path src = new Path("D:/KJ/hadoopKJ/shixunhadoop/log/log.log");
            Path dst = new Path("/logs.log");
            fs.copyFromLocalFile(src, dst);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }
    }

    // Run the upload once, outside the timer, for a quick test.
    public static void main(String[] args) {
        new HdfsTest().run();
    }
}
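Each timer tick copies to the same /logs.log. If the runs should not collide with an existing target, two small variations can be dropped into run() after fs is obtained (the timestamped destination name and the overwrite flag are my additions, not part of the original code):

// Inside HdfsTest.run(), after fs = FileSystem.get(uri, conf, user):

// Variation 1: give each upload its own name on HDFS (timestamped destination).
String stamp = new java.text.SimpleDateFormat("yyyyMMddHHmm").format(new java.util.Date());
fs.copyFromLocalFile(new Path("D:/KJ/hadoopKJ/shixunhadoop/log/log.log"),
        new Path("/logs-" + stamp + ".log"));

// Variation 2: keep a single target file and overwrite it on every run
// (delSrc = false, overwrite = true).
fs.copyFromLocalFile(false, true,
        new Path("D:/KJ/hadoopKJ/shixunhadoop/log/log.log"),
        new Path("/logs.log"));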
Timer code:
package com.zpark.kafkatest.two;

import java.util.Timer;

public class Time {
    public static void main(String[] args) {
        Timer timer = new Timer();
        // Schedule the HDFS upload task: run immediately, then every half hour
        // (30 * 60 * 1000 ms), matching the interval stated in the task above.
        timer.schedule(new HdfsTest(), 0, 30 * 60 * 1000L);
    }
}
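java.util.Timer is enough here, but a single uncaught exception in run() terminates the whole Timer thread and with it all future uploads. A ScheduledExecutorService does the same scheduling and is the more commonly used API in modern Java; this is an alternative sketch, not the original code (the class name TimeWithExecutor is made up for illustration):

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class TimeWithExecutor {
    public static void main(String[] args) {
        // One background thread that runs the upload task immediately and then every 30 minutes.
        // HdfsTest extends TimerTask, which implements Runnable, so it can be scheduled directly.
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        scheduler.scheduleAtFixedRate(new HdfsTest(), 0, 30, TimeUnit.MINUTES);
    }
}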