准备工作:
准备文件
event_attendees.csv
events.csv
user_friends.csv
users.csv
train.csv
存放目录
/opt/flume160/conf/jobkb09/tmp/
编写flume文件:
conf文件存放路径:/opt/flume160/conf/jobkb09/flume2Kafka
查看几个csv文件的头信息:
[root@hadoop100 tmp]# head -1 event_attendees.csv
event,yes,maybe,invited,no
[root@hadoop100 tmp]# head -1 events.csv
event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,c_13,c_14,c_15,c_16,c_17,c_18,c_19,c_20,c_21,c_22,c_23,c_24,c_25,c_26,c_27,c_28,c_29,c_30,c_31,c_32,c_33,c_34,c_35,c_36,c_37,c_38,c_39,c_40,c_41,c_42,c_43,c_44,c_45,c_46,c_47,c_48,c_49,c_50,c_51,c_52,c_53,c_54,c_55,c_56,c_57,c_58,c_59,c_60,c_61,c_62,c_63,c_64,c_65,c_66,c_67,c_68,c_69,c_70,c_71,c_72,c_73,c_74,c_75,c_76,c_77,c_78,c_79,c_80,c_81,c_82,c_83,c_84,c_85,c_86,c_87,c_88,c_89,c_90,c_91,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
[root@hadoop100 tmp]# head -1 train.csv
user,event,invited,timestamp,interested,not_interested
[root@hadoop100 tmp]# head -1 user_friends.csv
user,friends
[root@hadoop100 tmp]# head -1 users.csv
user_id,locale,birthyear,gender,joinedAt,location,timezone
创建source文件目录
mkdir -p /opt/flume160/conf/jobkb09/dataSourceFile/eventAttendees
mkdir -p /opt/flume160/conf/jobkb09/dataSourceFile/events
mkdir -p /opt/flume160/conf/jobkb09/dataSourceFile/train
mkdir -p /opt/flume160/conf/jobkb09/dataSourceFile/userFriends
mkdir -p /opt/flume160/conf/jobkb09/dataSourceFile/users
执行flume-ng的命令:
/opt/flume160/bin/flume-ng agent -n eventAttendees -c /opt/flume160/conf/ -f /opt/flume160/conf/jobkb09/flume2Kafka/eventAttendees-flume-kafka.conf -Dflume.root.logger=INFO,console
将csv文件移动到指定路径:
cp /opt/flume160/conf/jobkb09/tmp/event_attendees.csv /opt/flume160/conf/jobkb09/dataSourceFile/eventAttendees/eventAttendees_2021-01-13.csv
cp /opt/flume160/conf/jobkb09/tmp/events.csv /opt/flume160/conf/jobkb09/dataSourceFile/events/events_2021-01-13.csv
cp /opt/flume160/conf/jobkb09/tmp/train.csv /opt/flume160/conf/jobkb09/dataSourceFile/train/train_2021-01-13.csv
cp /opt/flume160/conf/jobkb09/tmp/user_friends.csv /opt/flume160/conf/jobkb09/dataSourceFile/userFriends/userFriends_2021-01-13.csv
cp /opt/flume160/conf/jobkb09/tmp/users.csv /opt/flume160/conf/jobkb09/dataSourceFile/users/users_2021-01-13.csv
启动Kafka的命令:
kafka-server-start.sh /opt/server.properties >> /var/kafka.log 2>&1 &
查看Kafka中的list
kafka-topics.sh --zookeeper 192.168.237.100:2181/kafka --list
kafka中删除topic的命令:
kafka-topics.sh --delete --zookeeper 192.168.237.100:2181/kafka --topic event_attendees_raw
查看event_attendees_raw的offset:(注意:这里 -time 的参数是-1)
kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list 192.168.237.100:9092 --topic event_attendees_raw -time -1 --offsets 1
# 进入flume目录下
cd /opt/flume160
# 执行flume-agent程序
./bin/flume-ng agent -n userFriends -c ./conf/ -f ./conf/jobkb09/flume2Kafka/userFriends-flume-kafka.conf -Dflume.root.logger=INFO,console
./bin/flume-ng agent -n eventAttendees -c ./conf/ -f ./conf/jobkb09/flume2Kafka/eventAttendees-flume-kafka.conf -Dflume.root.logger=INFO,console
./bin/flume-ng agent -n events -c ./conf/ -f ./conf/jobkb09/flume2Kafka/events-flume-kafka.conf -Dflume.root.logger=INFO,console
./bin/flume-ng agent -n train -c ./conf/ -f ./conf/jobkb09/flume2Kafka/train-flume-kafka.conf -Dflume.root.logger=INFO,console
./bin/flume-ng agent -n users -c ./conf/ -f ./conf/jobkb09/flume2Kafka/users-flume-kafka.conf -Dflume.root.logger=INFO,console
发生报错,但是数据可以写入:(标注错误)
Error while fetching metadata with correlation id 1 : {event_attendees_raw=LEADER_NOT_AVAILABLE}
创建hbase表:
create 'events_db:users','profile','region','registration'
create 'events_db:user_friend','uf'
create 'events_db:events','schedule','location','creator','remark'
create 'events_db:event_attendee','euat'
create 'events_db:event_train','eu'
----------------------eventAttendees-flume-kafka.conf-----------------------------
# --- eventAttendees agent: spooldir source -> file channel -> Kafka sink ---
eventAttendees.sources=eventAttendeesSource
eventAttendees.channels=eventAttendeesChannel
eventAttendees.sinks=eventAttendeesSink
# Spooling Directory Source: ingest completed CSV files dropped into spoolDir
eventAttendees.sources.eventAttendeesSource.type=spooldir
eventAttendees.sources.eventAttendeesSource.spoolDir=/opt/flume160/conf/jobkb09/dataSourceFile/eventAttendees
# Only pick up files named eventAttendees_YYYY-MM-DD.csv
eventAttendees.sources.eventAttendeesSource.includePattern=eventAttendees_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
# One Flume event per CSV line; raise the max line length for wide rows
eventAttendees.sources.eventAttendeesSource.deserializer=LINE
eventAttendees.sources.eventAttendeesSource.deserializer.maxLineLength=200000
# regex_filter with excludeEvents=true drops the CSV header line
# (header starts with "event"; data lines start with a numeric event id).
# NOTE(review): "^event*" matches "even" plus zero-or-more "t"; it works here,
# but "^event" states the intent more clearly -- confirm before changing.
eventAttendees.sources.eventAttendeesSource.interceptors=head_filter
eventAttendees.sources.eventAttendeesSource.interceptors.head_filter.type=regex_filter
eventAttendees.sources.eventAttendeesSource.interceptors.head_filter.regex=^event*
eventAttendees.sources.eventAttendeesSource.interceptors.head_filter.excludeEvents=true
# Durable file channel (checkpoint and data dirs must be writable by Flume)
eventAttendees.channels.eventAttendeesChannel.type=file
eventAttendees.channels.eventAttendeesChannel.checkpointDir=/opt/flume160/conf/jobkb09/checkPointFile/eventAttendees
eventAttendees.channels.eventAttendeesChannel.dataDirs=/opt/flume160/conf/jobkb09/dataChannelFile/eventAttendees
# Kafka sink: publish each line to topic event_attendees_raw
eventAttendees.sinks.eventAttendeesSink.type=org.apache.flume.sink.kafka.KafkaSink
eventAttendees.sinks.eventAttendeesSink.batchSize=640
eventAttendees.sinks.eventAttendeesSink.brokerList=192.168.237.100:9092
eventAttendees.sinks.eventAttendeesSink.topic=event_attendees_raw
# Wire source and sink to the channel (sink property is the singular "channel")
eventAttendees.sources.eventAttendeesSource.channels=eventAttendeesChannel
eventAttendees.sinks.eventAttendeesSink.channel=eventAttendeesChannel
----------------------events-flume-kafka.conf-----------------------------
# --- events agent: spooldir source -> file channel -> Kafka sink ---
events.sources=eventsSource
events.channels=eventsChannel
events.sinks=eventsSink
events.sources.eventsSource.type=spooldir
events.sources.eventsSource.spoolDir=/opt/flume160/conf/jobkb09/dataSourceFile/events
# Remember to adjust the source file name pattern for this dataset
events.sources.eventsSource.includePattern=events_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
events.sources.eventsSource.deserializer=LINE
events.sources.eventsSource.deserializer.maxLineLength=200000
events.sources.eventsSource.interceptors=head_filter
events.sources.eventsSource.interceptors.head_filter.type=regex_filter
# Remember to adjust the header regex (events.csv header starts with "event_id")
events.sources.eventsSource.interceptors.head_filter.regex=^event_id*
events.sources.eventsSource.interceptors.head_filter.excludeEvents=true
events.channels.eventsChannel.type=file
events.channels.eventsChannel.checkpointDir=/opt/flume160/conf/jobkb09/checkPointFile/events
events.channels.eventsChannel.dataDirs=/opt/flume160/conf/jobkb09/dataChannelFile/events
events.sinks.eventsSink.type=org.apache.flume.sink.kafka.KafkaSink
events.sinks.eventsSink.batchSize=640
events.sinks.eventsSink.brokerList=192.168.237.100:9092
# Remember to change the destination Kafka topic name
events.sinks.eventsSink.topic=events_raw
events.sources.eventsSource.channels=eventsChannel
events.sinks.eventsSink.channel=eventsChannel
----------------------userFriends-flume-kafka.conf-----------------------------
# --- userFriends agent: spooldir source -> file channel -> Kafka sink ---
userFriends.sources=userFriendsSource
userFriends.channels=userFriendsChannel
userFriends.sinks=userFriendsSink
userFriends.sources.userFriendsSource.type=spooldir
userFriends.sources.userFriendsSource.spoolDir=/opt/flume160/conf/jobkb09/dataSourceFile/userFriends
# Remember to adjust the source file name pattern for this dataset
userFriends.sources.userFriendsSource.includePattern=userFriends_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
userFriends.sources.userFriendsSource.deserializer=LINE
userFriends.sources.userFriendsSource.deserializer.maxLineLength=200000
userFriends.sources.userFriendsSource.interceptors=head_filter
userFriends.sources.userFriendsSource.interceptors.head_filter.type=regex_filter
# Remember to adjust the header regex (user_friends.csv header starts with "user")
userFriends.sources.userFriendsSource.interceptors.head_filter.regex=^user*
userFriends.sources.userFriendsSource.interceptors.head_filter.excludeEvents=true
userFriends.channels.userFriendsChannel.type=file
userFriends.channels.userFriendsChannel.checkpointDir=/opt/flume160/conf/jobkb09/checkPointFile/userFriends
userFriends.channels.userFriendsChannel.dataDirs=/opt/flume160/conf/jobkb09/dataChannelFile/userFriends
userFriends.sinks.userFriendsSink.type=org.apache.flume.sink.kafka.KafkaSink
userFriends.sinks.userFriendsSink.batchSize=640
userFriends.sinks.userFriendsSink.brokerList=192.168.237.100:9092
# Remember to change the destination Kafka topic name
userFriends.sinks.userFriendsSink.topic=user_friends_raw
userFriends.sources.userFriendsSource.channels=userFriendsChannel
userFriends.sinks.userFriendsSink.channel=userFriendsChannel
----------------------users-flume-kafka.conf-----------------------------
# --- users agent: spooldir source -> file channel -> Kafka sink ---
users.sources=usersSource
users.channels=usersChannel
users.sinks=usersSink
users.sources.usersSource.type=spooldir
users.sources.usersSource.spoolDir=/opt/flume160/conf/jobkb09/dataSourceFile/users
# Remember to adjust the source file name pattern for this dataset
users.sources.usersSource.includePattern=users_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
users.sources.usersSource.deserializer=LINE
users.sources.usersSource.deserializer.maxLineLength=200000
users.sources.usersSource.interceptors=head_filter
users.sources.usersSource.interceptors.head_filter.type=regex_filter
# Remember to adjust the header regex (users.csv header starts with "user_id")
users.sources.usersSource.interceptors.head_filter.regex=^user*
users.sources.usersSource.interceptors.head_filter.excludeEvents=true
users.channels.usersChannel.type=file
users.channels.usersChannel.checkpointDir=/opt/flume160/conf/jobkb09/checkPointFile/users
users.channels.usersChannel.dataDirs=/opt/flume160/conf/jobkb09/dataChannelFile/users
users.sinks.usersSink.type=org.apache.flume.sink.kafka.KafkaSink
users.sinks.usersSink.batchSize=640
users.sinks.usersSink.brokerList=192.168.237.100:9092
# Remember to change the destination Kafka topic name
users.sinks.usersSink.topic=users_raw
users.sources.usersSource.channels=usersChannel
users.sinks.usersSink.channel=usersChannel
-------------------------------train-flume-kafka.conf------------------------------
# --- train agent: spooldir source -> file channel -> Kafka sink ---
train.sources=trainSource
train.channels=trainChannel
train.sinks=trainSink
train.sources.trainSource.type=spooldir
train.sources.trainSource.spoolDir=/opt/flume160/conf/jobkb09/dataSourceFile/train
# Remember to adjust the source file name pattern for this dataset
train.sources.trainSource.includePattern=train_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
train.sources.trainSource.deserializer=LINE
train.sources.trainSource.deserializer.maxLineLength=200000
train.sources.trainSource.interceptors=head_filter
train.sources.trainSource.interceptors.head_filter.type=regex_filter
# Remember to adjust the header regex (train.csv header starts with "user")
train.sources.trainSource.interceptors.head_filter.regex=^user*
train.sources.trainSource.interceptors.head_filter.excludeEvents=true
train.channels.trainChannel.type=file
train.channels.trainChannel.checkpointDir=/opt/flume160/conf/jobkb09/checkPointFile/train
train.channels.trainChannel.dataDirs=/opt/flume160/conf/jobkb09/dataChannelFile/train
train.sinks.trainSink.type=org.apache.flume.sink.kafka.KafkaSink
train.sinks.trainSink.batchSize=640
train.sinks.trainSink.brokerList=192.168.237.100:9092
# Remember to change the destination Kafka topic name
train.sinks.trainSink.topic=train_raw
train.sources.trainSource.channels=trainChannel
train.sinks.trainSink.channel=trainChannel
---------------------------------以上5个文件都导入到Kafka中了----------------------
从Kafka导入到hbase:
hbase(main):004:0> count 'events_db:events' , INTERVAL => 100000
第一步:使用flume将csv文件的内容写入Kafka
编写flume的conf文件:
# --- userFriends agent (tutorial walk-through copy of the config above) ---
userFriends.sources=userFriendsSource
userFriends.channels=userFriendsChannel
userFriends.sinks=userFriendsSink
userFriends.sources.userFriendsSource.type=spooldir
userFriends.sources.userFriendsSource.spoolDir=/opt/flume160/conf/jobkb09/dataSourceFile/userFriends
userFriends.sources.userFriendsSource.includePattern=userFriends_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
userFriends.sources.userFriendsSource.deserializer=LINE
userFriends.sources.userFriendsSource.deserializer.maxLineLength=100000
# "interceptors" -- watch the spelling
userFriends.sources.userFriendsSource.interceptors=head_filter
userFriends.sources.userFriendsSource.interceptors.head_filter.type=regex_filter
userFriends.sources.userFriendsSource.interceptors.head_filter.regex=^user*
# Regex Filtering Interceptor property "excludeEvents":
# true  -> drop every line matching the regex (used here to drop the CSV header)
# false -> keep only the lines matching the regex
userFriends.sources.userFriendsSource.interceptors.head_filter.excludeEvents=true
userFriends.channels.userFriendsChannel.type=file
userFriends.channels.userFriendsChannel.checkpointDir=/opt/flume160/conf/jobkb09/checkPointFile/userFriends
userFriends.channels.userFriendsChannel.dataDirs=/opt/flume160/conf/jobkb09/dataChannelFile/userFriends
userFriends.sinks.userFriendsSink.type=org.apache.flume.sink.kafka.KafkaSink
userFriends.sinks.userFriendsSink.batchSize=640
userFriends.sinks.userFriendsSink.brokerList=192.168.237.100:9092
# Destination Kafka topic
userFriends.sinks.userFriendsSink.topic=user_friends_raw
userFriends.sources.userFriendsSource.channels=userFriendsChannel
# A sink binds to exactly one channel -- note the singular "channel"
userFriends.sinks.userFriendsSink.channel=userFriendsChannel
测试:
# 启动zookeeper
zkServer.sh start
# 启动Kafka服务
/opt/kafka211/bin/kafka-server-start.sh /opt/server.properties >> /var/kafka.log 2>&1 &
# 查看Kafka中的topic
kafka-topics.sh --zookeeper 192.168.237.100:2181/kafka --list
# 先删除Kafka中的 user_friends_raw topic
kafka-topics.sh --delete --zookeeper 192.168.237.100:2181/kafka --topic user_friends_raw
# 创建资源dataSourceFile目标文件夹
mkdir -p /opt/flume160/conf/jobkb09/dataSourceFile/userFriends
# 将文件按照指定格式放入
cp /opt/flume160/conf/jobkb09/tmp/user_friends.csv /opt/flume160/conf/jobkb09/dataSourceFile/userFriends
cd /opt/flume160/conf/jobkb09/dataSourceFile/userFriends
mv user_friends.csv userFriends_2020-01-12.csv
# 进入flume目录下
cd /opt/flume160
# 执行flume-agent程序
./bin/flume-ng agent -n userFriends -c ./conf/ -f ./conf/jobkb09/flume2Kafka/userFriends-flume-kafka.conf -Dflume.root.logger=INFO,console
# 查看topic的offset
kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list 192.168.237.100:9092 --topic user_friends_raw -time -1 --offsets 1
在HBase中建表,定义列族:
create_namespace 'events_db'
create 'events_db:users','profile','region','registration'
create 'events_db:user_friend','uf'
create 'events_db:events','schedule','location','creator','remark'
create 'events_db:event_attendee','euat'
create 'events_db:event_train','eu'
第二步:写Java代码将Kafka中的数据消费到Hbase
/**
 * Consumes "user,friend" pairs from the Kafka topic user_friends_raw and
 * writes them into the HBase table events_db:user_friend (column family "uf").
 */
public class UserFriendTohb {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.237.100:9092");
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 30000);
        // Auto-commit is disabled; offsets are committed manually after each
        // successful HBase batch write (at-least-once delivery). The original
        // also set AUTO_COMMIT_INTERVAL_MS, which is ignored in this mode.
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, "userFriend");
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop);
        // BUG FIX: the Flume sink in this pipeline publishes to
        // "user_friends_raw" (see the sink config), not "user_friends".
        consumer.subscribe(Collections.singleton("user_friends_raw"));
        // Configure HBase and open the connection.
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", "hdfs://192.168.237.100:9000/hbase");
        conf.set("hbase.zookeeper.quorum", "192.168.237.100");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table hbaseTable = connection.getTable(TableName.valueOf("events_db:user_friend"))) {
            while (true) {
                ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(100));
                List<Put> datas = new ArrayList<>();
                for (ConsumerRecord<String, String> record : records) {
                    String[] split = record.value().split(",");
                    // Guard against malformed lines: String.split(",") drops
                    // trailing empty fields, so a pair without a friend id
                    // would otherwise throw ArrayIndexOutOfBoundsException.
                    if (split.length < 2) {
                        continue;
                    }
                    // Row key: hash of the concatenated user and friend ids.
                    Put put = new Put(Bytes.toBytes((split[0] + split[1]).hashCode()));
                    put.addColumn("uf".getBytes(), "userid".getBytes(), split[0].getBytes());
                    put.addColumn("uf".getBytes(), "friendid".getBytes(), split[1].getBytes());
                    datas.add(put);
                }
                if (!datas.isEmpty()) {
                    hbaseTable.put(datas);
                }
                // Commit only after HBase has accepted the batch.
                consumer.commitSync();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
启动HBase:
# 启动hadoop集群
start-all.sh
# 启动hbase
start-hbase.sh
# 建表
create_namespace 'events_db'
create 'events_db:users','profile','region','registration'
create 'events_db:user_friend','uf'
create 'events_db:events','schedule','location','creator','remark'
create 'events_db:event_attendee','euat'
create 'events_db:event_train','eu'
# 运行Java代码
# 查看写入的row key数量
count 'events_db:user_friend' ,INTERVAL => 10000
代码复用性太低,重复代码太多,需要面向接口编程,开始抽象提取:
首先定义IWriter接口:
// Writer abstraction: persist one batch of consumed Kafka records into the
// named target table and report how many rows were written.
public interface IWriter {
// Returns the number of rows written; throws on storage failure.
int write(ConsumerRecords<String, String> records,String tableName) throws IOException;
}
只需要传入要写的表名,以及要写的消费者数据,就可以完成写入操作
具体实现,定义写入hbase:
/**
 * IWriter implementation that turns consumed Kafka "user,friend" records
 * into Puts on column family "uf" of the named HBase table.
 */
public class HBaseWriter implements IWriter {
    // Shared HBase connection, opened once in the constructor.
    private Connection connection;

    public HBaseWriter() {
        // Configure and open the HBase connection.
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", "hdfs://192.168.237.100:9000/hbase");
        conf.set("hbase.zookeeper.quorum", "192.168.237.100");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        try {
            connection = ConnectionFactory.createConnection(conf);
        } catch (IOException e) {
            // NOTE(review): connection stays null on failure and write()
            // will then throw NPE; consider failing fast instead.
            e.printStackTrace();
        }
    }

    /**
     * Converts each record into a Put and writes the batch to tableName
     * (e.g. "events_db:user_friend"). Returns the number of rows written.
     */
    @Override
    public int write(ConsumerRecords<String, String> records, String tableName) throws IOException {
        Table table = connection.getTable(TableName.valueOf(tableName));
        // BUG FIX: the original initialized this list from a non-existent
        // "parseRecord" field (copy-pasted from the later strategy-based
        // version of this class) and would not compile; build it locally.
        List<Put> datas = new ArrayList<>();
        for (ConsumerRecord<String, String> record : records) {
            String[] split = record.value().split(",");
            // Skip malformed lines (split(",") drops trailing empty fields).
            if (split.length < 2) {
                continue;
            }
            Put put = new Put(Bytes.toBytes((split[0] + split[1]).hashCode()));
            put.addColumn("uf".getBytes(), "userid".getBytes(), split[0].getBytes());
            put.addColumn("uf".getBytes(), "friendid".getBytes(), split[1].getBytes());
            datas.add(put);
        }
        table.put(datas);
        return datas.size();
    }
}
此时的main方法:
/**
 * Streams user-friend pairs from Kafka into HBase through the IWriter
 * abstraction (this version still hard-codes topic and table in main).
 */
public class UserFriendTohb2 {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.237.100:9092");
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 30000);
        // Auto-commit is disabled; offsets are committed manually after each
        // successful write. (AUTO_COMMIT_INTERVAL_MS was dead config here.)
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, "userFriend");
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop);
        // BUG FIX: the Flume sink publishes to "user_friends_raw",
        // not "user_friends" (the later worker-based main agrees).
        consumer.subscribe(Collections.singleton("user_friends_raw"));
        IWriter iWriter = new HBaseWriter();
        try {
            System.out.println("--------------userFriend to hbase ---------");
            while (true) {
                ConsumerRecords<String, String> poll = consumer.poll(Duration.ofMillis(100));
                int write = iWriter.write(poll, "events_db:user_friend");
                System.out.println("写入行数: " + write);
                // Commit only after HBase has accepted the batch.
                consumer.commitSync();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
这是代码已经简洁了一些,但此时可以看到write方法中的实现逻辑还是写死的,当想写入其他表中的时候要重写整个方法并没有起到想要的效果,这时候应该把for循环中的业务逻辑也抽取出来,定义一个新的接口,传入的参数是ConsumerRecords<String,String>返回的结果是 List< Put > :
// 将Kafka消费的信息,通过加工转换,得到List<Put>对象,用于hbase存储使用
// Transforms one batch of consumed Kafka records into HBase Put objects,
// ready to be stored; one implementation per dataset/table schema.
public interface IParseRecord {
List<Put> parse (ConsumerRecords<String,String> records);
}
定义具体的实现:
// Parses "user,friend" CSV lines into Puts for events_db:user_friend
// (column family "uf").
public class UserFriendsHandler implements IParseRecord {
@Override
public List<Put> parse(ConsumerRecords<String, String> records) {
List<Put> datas = new ArrayList<>();
for (ConsumerRecord<String, String> p : records) {
String[] split = p.value().split(",");
// Skip malformed lines: String.split(",") drops trailing empty fields,
// so a record without a friend id yields fewer than 2 parts.
if(split.length==2) {
// Row key: hash of the concatenated user and friend ids.
Put put = new Put(Bytes.toBytes((split[0] + split[1]).hashCode()));
put.addColumn("uf".getBytes(), "userid".getBytes(), split[0].getBytes());
put.addColumn("uf".getBytes(), "friendid".getBytes(), split[1].getBytes());
datas.add(put);
}
}
return datas;
}
}
改写HBaseWriter:
// Writes Kafka batches to HBase; row construction is delegated to the
// injected IParseRecord strategy, so one writer class serves every table.
public class HBaseWriter implements IWriter {
private Connection connection;
// Record-to-Put conversion strategy, supplied by the caller.
private IParseRecord parseRecord;
// The handler is injected through the constructor (it could equally be
// exposed through getter/setter methods if late binding were needed).
public HBaseWriter(IParseRecord parseRecord) {
this.parseRecord = parseRecord;
// Configure and open the HBase connection once per writer instance.
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.rootdir","hdfs://192.168.237.100:9000/hbase");
conf.set("hbase.zookeeper.quorum","192.168.237.100");
conf.set("hbase.zookeeper.property.clientPort","2181");
try {
connection = ConnectionFactory.createConnection(conf);
} catch (IOException e) {
// NOTE(review): connection stays null on failure and write() will then
// throw NPE; consider failing fast instead.
e.printStackTrace();
}
}
// Convert the batch via the handler, write it to tableName
// (e.g. "events_db:user_friend"), and return the rows written.
@Override
public int write(ConsumerRecords<String, String> records, String tableName ) throws IOException {
Table hbaseTable = connection.getTable(TableName.valueOf(tableName));
List<Put> datas = parseRecord.parse(records);
hbaseTable.put(datas);
return datas.size();
}
}
这时候可以把整个main方法抽取出去,通过fillDatas方法完成实现逻辑
定义IWorker接口:
// A worker owns one complete consume-and-store pipeline; fillData() runs it.
public interface IWorker {
// Continuously moves data from the source to the target store; blocks.
void fillData();
}
实现HBaseWorker:
// Kafka -> HBase worker: consumes one topic and writes it into one HBase
// table through the injected IWriter.
public class HbaseWorker implements IWorker {
private IWriter writer;
private String topic;
private Properties prop;
private String target;
// Convenience constructor using a default consumer group.
public HbaseWorker(String topic,String targetTable,IWriter writer) {
this("defaultGroup",topic,targetTable,writer);
}
public HbaseWorker(String groupName,String topic,String targetTable,IWriter writer) {
prop = new Properties();
prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG,"192.168.237.100:9092");
prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,StringDeserializer.class);
prop.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG,30000);
// NOTE(review): auto-commit is disabled but fillData() never commits
// manually, so with auto.offset.reset=earliest the whole topic is
// re-consumed on every restart -- confirm this is intended.
prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,false);
// NOTE(review): this interval is ignored while enable.auto.commit=false.
prop.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG,1000);
prop.put(ConsumerConfig.GROUP_ID_CONFIG,groupName);
prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,"earliest");
this.topic = topic;
this.target = targetTable;
this.writer = writer;
}
// Poll forever, delegating each batch to the writer. Never returns.
@Override
public void fillData() {
KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop);
consumer.subscribe(Collections.singleton(topic));
try {
System.out.println("--------------userFriend to hbase ---------");
while(true) {
ConsumerRecords<String, String> poll = consumer.poll(Duration.ofMillis(100));
int write = writer.write(poll, target);
System.out.println("写入行数: "+write);
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
这时候的main方法:
// Entry point: stream Kafka topic user_friends_raw into the HBase table
// events_db:user_friend via the worker/writer/handler abstractions.
public class UserFriendTohb2 {
public static void main(String[] args) {
HbaseWorker hbaseWorker = new HbaseWorker(
"userfriends", // consumer group id
"user_friends_raw", // source Kafka topic
"events_db:user_friend", // target HBase table
new HBaseWriter(new UserFriendsHandler())
);
// Blocks forever, polling Kafka and writing batches to HBase.
hbaseWorker.fillData();
}
}
HBaseWorker还可以继续抽象父类:
// Shared base for all workers: builds the common Kafka consumer properties
// once, so concrete workers only supply the group name.
public abstract class ParentWorker implements IWorker {
protected Properties prop;
public ParentWorker(String groupName){
prop = new Properties();
prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG,"192.168.237.100:9092");
prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,StringDeserializer.class);
prop.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG,30000);
// NOTE(review): auto-commit is off and subclasses issue no manual commit,
// so offsets are never committed; with auto.offset.reset=earliest the
// topics are re-consumed on restart -- confirm this is intended.
prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,false);
// NOTE(review): this interval is ignored while enable.auto.commit=false.
prop.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG,1000);
prop.put(ConsumerConfig.GROUP_ID_CONFIG,groupName);
prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,"earliest");
}
}
此时的HBaseWorker:
// Kafka -> HBase worker built on ParentWorker: consumes one topic and
// writes it into one HBase table through the injected IWriter.
public class HbaseWorker extends ParentWorker {
private IWriter writer;
private String topic;
private String target;
// Convenience constructor using a default consumer group.
public HbaseWorker(String topic,String targetTable,IWriter writer) {
this("defaultGroup",topic,targetTable,writer);
}
public HbaseWorker(String groupName,String topic,String targetTable,IWriter writer) {
super(groupName);
this.topic = topic;
this.target = targetTable;
this.writer = writer;
}
// Poll forever, delegating each batch to the writer. Never returns.
// NOTE(review): the banner mentions "userFriend" but this worker is
// generic -- leftover from the original single-table version.
@Override
public void fillData() {
KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop);
consumer.subscribe(Collections.singleton(topic));
try {
System.out.println("--------------userFriend to hbase ---------");
while(true) {
ConsumerRecords<String, String> poll = consumer.poll(Duration.ofMillis(100));
int write = writer.write(poll, target);
System.out.println("写入行数: "+write);
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
会发现全都抽象成接口,内部的改动对main方法是完全不影响的。
当要实现将其他表导入HBase中的时候,只要写对应的worker实现类,完成业务逻辑:
1.将Kafka中events_raw写入hbase:EventsHandler
// Maps events_raw CSV lines (event_id,user_id,start_time,city,state,zip,
// country,lat,lng,c_1,...) into Puts for events_db:events.
public class EventsHandler implements IParseRecord {
@Override
public List<Put> parse(ConsumerRecords<String, String> records) {
List<Put> datas = new ArrayList<>();
for (ConsumerRecord<String, String> record : records) {
String[] split = record.value().split(",");
// NOTE(review): no length guard -- a line with fewer than 10 fields
// throws ArrayIndexOutOfBoundsException, and commas inside quoted CSV
// fields would be split incorrectly; confirm the input is clean.
// Row key: hash of event_id.
Put put = new Put(Bytes.toBytes(split[0].hashCode()));
put.addColumn("schedule".getBytes(),"start_time".getBytes(),split[2].getBytes());
put.addColumn("location".getBytes(),"city".getBytes(),split[3].getBytes());
put.addColumn("location".getBytes(),"state".getBytes(),split[4].getBytes());
put.addColumn("location".getBytes(),"zip".getBytes(),split[5].getBytes());
put.addColumn("location".getBytes(),"country".getBytes(),split[6].getBytes());
put.addColumn("location".getBytes(),"lat".getBytes(),split[7].getBytes());
put.addColumn("location".getBytes(),"lng".getBytes(),split[8].getBytes());
put.addColumn("creator".getBytes(),"user_id".getBytes(),split[1].getBytes());
// NOTE(review): split[9] is column c_1 of the raw header, stored here
// as remark:common_words -- verify this is the intended mapping.
put.addColumn("remark".getBytes(),"common_words".getBytes(),split[9].getBytes());
datas.add(put);
}
return datas;
}
}
2.将Kafka中train_raw写入hbase:TrainHandler
// Maps train_raw CSV lines (user,event,invited,timestamp,interested,
// not_interested) into Puts for events_db:event_train (column family "eu").
public class TrainHandler implements IParseRecord {
@Override
public List<Put> parse(ConsumerRecords<String, String> records) {
List<Put> datas = new ArrayList<>();
for (ConsumerRecord<String, String> record : records) {
String[] split = record.value().split(",");
// NOTE(review): no length guard -- String.split(",") drops trailing
// empty fields, so a row with an empty not_interested column would
// throw ArrayIndexOutOfBoundsException; confirm the input is clean.
// Row key: hash of the concatenated user and event ids.
Put put = new Put(Bytes.toBytes((split[0]+split[1]).hashCode()));
put.addColumn("eu".getBytes(),"user_id".getBytes(),split[0].getBytes());
put.addColumn("eu".getBytes(),"event_id".getBytes(),split[1].getBytes());
put.addColumn("eu".getBytes(),"invited".getBytes(),split[2].getBytes());
put.addColumn("eu".getBytes(),"timestamp".getBytes(),split[3].getBytes());
put.addColumn("eu".getBytes(),"interested".getBytes(),split[4].getBytes());
put.addColumn("eu".getBytes(),"not_interested".getBytes(),split[5].getBytes());
datas.add(put);
}
return datas;
}
}
3.将Kafka中users_raw写入hbase:UsersHandler
//user_id,locale,birthyear,gender,joinedAt,location,timezone
public class UsersHandler implements IParseRecord {
@Override
public List<Put> parse(ConsumerRecords<String, String> records) {
List<Put> datas = new ArrayList<>();
for (ConsumerRecord<String, String> record : records) {
String[] split = record.value().split(",");
if(split[0].trim().length()==0){
continue;
}
System.out.println(record);
Put put = new Put(Bytes.toBytes(split[0].hashCode()));
put.addColumn("profile".getBytes(),"birthyear".getBytes(),split[2].getBytes());
put.addColumn("profile".getBytes(),"gender".getBytes(),split[3].getBytes());
put.addColumn("region".getBytes(),"locale".getBytes(),split[1].getBytes());
if(split.length>5){
put.addColumn("region".getBytes(),"location".getBytes(),split[5].getBytes());
}
if (split.length>6){
put.addColumn("region".getBytes(),"timezone".getBytes(),split[6].getBytes());
}
if (split.length>4){
put.addColumn("registration".getBytes(),"joinedAt".getBytes(),split[4].getBytes());
}
datas.add(put);
}
return datas;
}
}
4.将Kafka中user_friends_raw写入hbase:UserFriendsHandler
/**
 * Parses "user,friend" CSV lines into Puts for events_db:user_friend
 * (column family "uf").
 */
public class UserFriendsHandler implements IParseRecord {
    @Override
    public List<Put> parse(ConsumerRecords<String, String> records) {
        List<Put> datas = new ArrayList<>();
        for (ConsumerRecord<String, String> record : records) {
            String[] split = record.value().split(",");
            // BUG FIX: String.split(",") drops trailing empty fields, so a
            // record without a friend id has fewer than 2 parts and the
            // original indexing threw ArrayIndexOutOfBoundsException. Skip
            // such lines, matching the guarded variant of this handler
            // earlier in the file.
            if (split.length < 2) {
                continue;
            }
            // Row key: hash of the concatenated user and friend ids.
            Put put = new Put(Bytes.toBytes((split[0] + split[1]).hashCode()));
            put.addColumn("uf".getBytes(), "userid".getBytes(), split[0].getBytes());
            put.addColumn("uf".getBytes(), "friendid".getBytes(), split[1].getBytes());
            datas.add(put);
        }
        return datas;
    }
}
5.将Kafka中event_attendees_raw写入hbase:EventAttendeesHandler
public class EventAttendeesHandlerimplements IParseRecord {
@Override
public List<Put> parse(ConsumerRecords<String, String> records) {
List<Put> datas = new ArrayList<>();
for (ConsumerRecord<String, String> record : records) {
String[] split = record.value().split(",");
Put put = new Put(Bytes.toBytes(split[0].hashCode()));
put.addColumn("euat".getBytes(),"event".getBytes(),split[0].getBytes());
put.addColumn("euat".getBytes(),"yes".getBytes(),split[1].getBytes());
put.addColumn("euat".getBytes(),"maybe".getBytes(),split[2].getBytes());
put.addColumn("euat".getBytes(),"invited".getBytes(),split[3].getBytes());
put.addColumn("euat".getBytes(),"no".getBytes(),split[4].getBytes());
datas.add(put);
}
return datas;
}
}
main方法:
// Each fillData() call blocks forever in its internal while(true) poll loop,
// so calling the five workers sequentially (as the original code did) only
// ever ran the first one -- the remaining four were unreachable. Start each
// worker on its own thread so all five topics are drained concurrently.
// train_raw -> events_db:train
new Thread(new HbaseWorker("train", "train_raw", "events_db:train",
        new HBaseWriter(new TrainHandler()))::fillData, "train-worker").start();
// event_attendees_raw -> events_db:event_attendee
new Thread(new HbaseWorker("event_attendees", "event_attendees_raw", "events_db:event_attendee",
        new HBaseWriter(new EventAttendeesHandler()))::fillData, "event-attendees-worker").start();
// user_friends_raw -> events_db:user_friend
new Thread(new HbaseWorker("user_friend", "user_friends_raw", "events_db:user_friend",
        new HBaseWriter(new UserFriendsHandler()))::fillData, "user-friends-worker").start();
// users_raw -> events_db:users
new Thread(new HbaseWorker("users", "users_raw", "events_db:users",
        new HBaseWriter(new UsersHandler()))::fillData, "users-worker").start();
// events_raw -> events_db:events
new Thread(new HbaseWorker("events", "events_raw", "events_db:events",
        new HBaseWriter(new EventsHandler()))::fillData, "events-worker").start();
需要提前在hive中建的表:
动态分区:
hive> set hive.exec.dynamic.partition=true;
hive> set hive.exec.dynamic.partition.mode=nonstrict;
hive> set hive.auto.convert.join=false;
创建数据库
hive> create database eventsODS;
hive> create database eventsDWD;
hive> create database eventsDWS;
hive> create database eventsADS;
use events;
show tables;
ODS_user 源数据层
DWD_user_info
DWS_USER_MESG
hive> create database events;
------------------------------------user_friend----------------------------------
-- External Hive table mapped onto HBase events_db:user_friend.
-- Column order must match hbase.columns.mapping (first column = row key).
create external table events.hb_user_friend(
    row_key string,
    user_id string,
    friend_id string)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties('hbase.columns.mapping'=':key,
uf:userid,
uf:friendid')
tblproperties('hbase.table.name'='events_db:user_friend');
-- Materialized ORC copy. FIX: schema-qualified both names (the original
-- created/read tables in whatever database happened to be current) and
-- added the missing terminating semicolon.
create table events.user_friend
stored as ORC AS
select * from events.hb_user_friend;
----------------------------event_attendee-------------------------------------
-- External Hive table mapped onto HBase events_db:event_attendee.
-- NOTE(review): this mapping expects columns euat:eventid/userid/state, but
-- EventAttendeesHandler in this file writes euat:event/yes/maybe/invited/no
-- -- confirm which schema is current before querying.
create external table events.hb_event_attendee(
    row_key string,
    event_id string,
    user_id string,
    attend_type string)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties('hbase.columns.mapping'=':key,
euat:eventid,
euat:userid,
euat:state')
tblproperties('hbase.table.name'='events_db:event_attendee');
-- Materialized ORC copy. FIX: schema-qualified the table name for
-- consistency with the other sections of this file.
create table events.event_attendee
stored as ORC AS
select * from events.hb_event_attendee;
-------------------------------------------events--------------------------------------
-- External Hive table mapped onto HBase events_db:events.
-- Column order must match hbase.columns.mapping below (first column = :key).
create external table events.hb_events(
event_id string,
start_time string,
city string,
state string,
zip string,
country string,
lat string,
lng string,
user_id string,
common_words string
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties('hbase.columns.mapping'=':key,
schedule:start_time,
location:city,
location:state,
location:zip,
location:country,
location:lat,
location:lng,
creator:user_id,
remark:common_words')
tblproperties('hbase.table.name'='events_db:events');
-- Materialized ORC copy of the mapped table.
create table events.events
stored as ORC AS
select * from events.hb_events;
------------------------------------------train-------------------------------
-- External Hive table mapped onto HBase events_db:event_train.
-- Column order must match hbase.columns.mapping (first column = row key).
create external table events.hb_train(
    row_key string,
    user_id string,
    event_id string,
    invited string,
    -- FIX: backquoted -- "timestamp" is a reserved keyword in Hive and
    -- fails to parse as a bare column name.
    `timestamp` string,
    interested string,
    not_interested string
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties('hbase.columns.mapping'=':key,
eu:user_id,
eu:event_id,
eu:invited,
eu:timestamp,
eu:interested,
eu:not_interested')
tblproperties('hbase.table.name'='events_db:event_train');
-- Materialized ORC copy. FIX: added the missing terminating semicolon.
create table events.train
stored as ORC AS
select * from events.hb_train;
----------------------------------------------users----------------------------------
-- External Hive table mapped onto HBase events_db:users.
-- Column order must match hbase.columns.mapping below (first column = :key).
-- NOTE(review): birthyear is declared int, but UsersHandler stores the raw
-- string bytes; non-numeric values will read back as NULL -- confirm that
-- is acceptable.
create external table events.hb_users(
user_id string,
birthyear int,
gender string,
locale string,
location string,
timezone string,
joinedAt string
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties('hbase.columns.mapping'=':key,
profile:birthyear,
profile:gender,
region:locale,
region:location,
region:timezone,
registration:joinedAt')
tblproperties('hbase.table.name'='events_db:users');
-- Materialized ORC copy of the mapped table.
create table events.users
stored as ORC AS
select * from events.hb_users;