Flume collection stage
Deploy a Flume agent to watch the log files produced by the application and forward them to the Kafka cluster. The agent is named test:
test.sources = s1
test.channels = c1
test.sinks = k1
test.sources.s1.type = spooldir
test.sources.s1.spoolDir = /opt/kb07file/flumeFile/test
test.sources.s1.deserializer = LINE
test.sources.s1.deserializer.maxLineLength = 60000
test.sources.s1.includePattern = test_[0-9]{4}-[0-9]{2}-[0-9]{2}\.csv
test.channels.c1.type = file
test.channels.c1.checkpointDir = /opt/kb07file/flumeFile/checkpoint/test
test.channels.c1.dataDir = /opt/kb07file/flumeFile/data/test
test.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
test.sinks.k1.batchSize = 640
test.sinks.k1.brokerList = 192.168.174.41:9092
test.sinks.k1.topic = test
test.sources.s1.channels = c1
test.sinks.k1.channel = c1
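Assuming the configuration above is saved as test.conf (the path below is an assumption), the agent can be started with the standard flume-ng launcher:

flume-ng agent --conf $FLUME_HOME/conf --conf-file /opt/kb07file/flumeFile/test.conf --name test -Dflume.root.logger=INFO,console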
Kafka cleaning stage
A Kafka Streams application splits each raw record of the form userId,friend1 friend2 ... from the user_friends_raw topic into one (userId friendId) pair per friend and writes the pairs to the user_friends topic:
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.Topology;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.CountDownLatch;

public class UserFriend {
    public static void main(String[] args) {
        // Kafka Streams configuration
        Properties prop = new Properties();
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.174.41:9092");
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "kb07");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());

        StreamsBuilder builder = new StreamsBuilder();
        builder.stream("user_friends_raw")
                // drop the CSV header and any record without exactly two fields
                .filter((k, v) -> (!v.toString().startsWith("user,") && v.toString().split(",").length == 2))
                // "1,2 3 4" -> ("1 2"), ("1 3"), ("1 4")
                .flatMap((k, v) -> {
                    System.out.println(k + " " + v);          // debug trace of each raw record
                    List<KeyValue<String, String>> keyValues = new ArrayList<>();
                    String[] split = v.toString().split(","); // ["1", "2 3 4"]
                    String userId = split[0];                 // "1"
                    String[] friends = split[1].split(" ");   // ["2", "3", "4"]
                    for (String friend : friends) {
                        keyValues.add(new KeyValue<>(null, userId + " " + friend));
                    }
                    return keyValues;
                })
                .to("user_friends");

        Topology topo = builder.build();
        KafkaStreams streams = new KafkaStreams(topo, prop);
        CountDownLatch countDownLatch = new CountDownLatch(1);
        // close the Streams application cleanly on Ctrl+C
        Runtime.getRuntime().addShutdownHook(new Thread("kb07") {
            @Override
            public void run() {
                streams.close();
                countDownLatch.countDown();
            }
        });
        streams.start();
        try {
            countDownLatch.await();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        System.exit(0);
    }
}
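To spot-check the cleaned output, a console consumer against the output topic is enough (broker address and topic name taken from the code above):

kafka-console-consumer.sh --bootstrap-server 192.168.174.41:9092 --topic user_friends --from-beginning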
Persisting the data into HBase
A plain Kafka consumer reads the cleaned user_friends topic, batches the pairs into Put objects, and writes them to the HBase table b:user_friend:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;

public class UserFriendhb {
    public static void main(String[] args) {
        // Kafka consumer configuration
        Properties prop = new Properties();
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.174.41:9092");
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "30000");
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, "aaa");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop);
        consumer.subscribe(Collections.singletonList("user_friends"));

        // HBase connection configuration
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.rootdir", "hdfs://192.168.174.41:9000/hbase");
        config.set("hbase.zookeeper.quorum", "192.168.174.41");
        config.set("hbase.zookeeper.property.clientPort", "2181");
        try {
            Connection connection = ConnectionFactory.createConnection(config);
            Table table = connection.getTable(TableName.valueOf("b:user_friend"));
            while (true) {
                ConsumerRecords<String, String> records = consumer.poll(100);
                // the Put list must be recreated inside the loop,
                // otherwise it grows without bound across polls
                List<Put> putList = new ArrayList<>();
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println(record); // debug trace
                    String[] infos = record.value().split(" ");
                    // row key: hash of userId + friendId, so replays stay idempotent
                    Put put = new Put(Bytes.toBytes((infos[0] + infos[1]).hashCode()));
                    put.addColumn("uf".getBytes(), "userid".getBytes(), infos[0].getBytes());
                    put.addColumn("uf".getBytes(), "friendid".getBytes(), infos[1].getBytes());
                    putList.add(put);
                }
                table.put(putList);
                // commit offsets only after the batch has been written to HBase
                consumer.commitAsync();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
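The target table and its column family must exist before the consumer starts; a minimal sketch in the HBase shell, assuming the b namespace used in the code above:

create_namespace 'b'
create 'b:user_friend', 'uf'

After the consumer has run for a while, scan 'b:user_friend', {LIMIT => 5} confirms that rows are arriving.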
Mapping the HBase table into Hive
An external Hive table is laid over the HBase table events_db:users through the HBase storage handler, then materialized as an ORC table for faster queries:
create external table eventskb07.hb_user(
row_key string,
locale string,
birth_year string,
gender string,
joinedAt string,
location string,
timezone string)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,region:locale,profile:birth_year,profile:gender,registration:joinedAt,region:location,region:timezone')
tblproperties ('hbase.table.name' = 'events_db:users');
create table eventskb07.users
stored as orc as
select * from eventskb07.hb_user;
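A quick sanity check on the materialized table (row counts will depend on your dataset):

select count(*) from eventskb07.users;
select * from eventskb07.users limit 5;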