package _20210531.oop.kafkatoHbase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
/**
 * @Author Xulihua
 * @Date 2021/5/31
 * @Description Consume the user_friends topic from Kafka and write it into the HBase table event_db:user_friend
 */
public class UserFriendToHB {
    static int num = 0;

    public static void main(String[] args) {
        // Kafka consumer configuration
        final Properties properties = new Properties();
        properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.107.103:9092");
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "30000");
//        properties.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, "1000"); // heartbeat interval
        // Disable offset auto-commit; offsets are committed manually after each successful HBase write
        properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, 1000); // only takes effect when auto-commit is enabled
        properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        // Changing the group id makes the job re-read the whole topic, even when auto-commit is true
        properties.put(ConsumerConfig.GROUP_ID_CONFIG, "user_friend_group");
        KafkaConsumer<String, String> kafkaConsumer = new KafkaConsumer<>(properties);
        kafkaConsumer.subscribe(Collections.singleton("user_friends"));
        // Configure the HBase connection (string-key equivalents: hbase.rootdir,
        // hbase.zookeeper.quorum, hbase.zookeeper.property.clientPort)
        Configuration conf = HBaseConfiguration.create();
        conf.set(HConstants.HBASE_DIR, "hdfs://192.168.107.103:9000/hbase2");
        conf.set(HConstants.ZOOKEEPER_QUORUM, "192.168.107.103");
        conf.set(HConstants.CLIENT_PORT_STR, "2181");
        try {
            Connection connection = ConnectionFactory.createConnection(conf);
            Table userFriendTable = connection.getTable(TableName.valueOf("event_db:user_friend"));
            while (true) {
                ConsumerRecords<String, String> records = kafkaConsumer.poll(Duration.ofMillis(100));
                List<Put> datas = new ArrayList<>();
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println(record.value());
                    String[] split = record.value().split(",");
                    // Rowkey: hashCode of userid + friendid (colliding keys overwrite each other, see the notes below)
                    String str = split[0] + split[1];
                    int hash = str.hashCode();
                    Put put = new Put(Bytes.toBytes(hash));
                    put.addColumn("uf".getBytes(), "userid".getBytes(), split[0].getBytes());   // column family, qualifier, value
                    put.addColumn("uf".getBytes(), "friendid".getBytes(), split[1].getBytes()); // column family, qualifier, value
                    datas.add(put);
                }
                num += datas.size();
                System.out.println("------------------------------------------num: " + num);
                if (!datas.isEmpty()) {
                    userFriendTable.put(datas);
                }
                // Commit offsets manually only after the batch has been written to HBase
                kafkaConsumer.commitSync();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
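The consumer assumes the event_db namespace and the event_db:user_friend table (column family "uf") already exist. Below is a minimal one-off sketch for creating them with the HBase Admin API; the builder classes assume HBase 2.x, and the class name CreateUserFriendTable is just an illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;

public class CreateUserFriendTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set(HConstants.ZOOKEEPER_QUORUM, "192.168.107.103");
        conf.set(HConstants.CLIENT_PORT_STR, "2181");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            // Create the event_db namespace, then the user_friend table with column family "uf"
            admin.createNamespace(NamespaceDescriptor.create("event_db").build());
            admin.createTable(
                    TableDescriptorBuilder.newBuilder(TableName.valueOf("event_db:user_friend"))
                            .setColumnFamily(ColumnFamilyDescriptorBuilder.of("uf"))
                            .build());
        }
    }
}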
Total rows in the user_friends topic: 30386403.
Rows actually written to HBase: 30279525. Cause: the code uses the hashCode of (userid + friendid) as the rowkey, which produces duplicate rowkeys, and HBase deduplicates automatically: a Put with an existing rowkey simply overwrites the earlier row, so rows are lost.
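A minimal demonstration of why hashCode rowkeys collide: String.hashCode has only 32 bits, so distinct strings can share a value (the class name HashCollisionDemo and the example strings are just an illustration):

public class HashCollisionDemo {
    public static void main(String[] args) {
        // "Aa" and "BB" are different strings with the same hashCode (2112);
        // as HBase rowkeys, one Put would silently overwrite the other
        System.out.println("Aa".hashCode()); // 2112
        System.out.println("BB".hashCode()); // 2112
    }
}

A back-of-the-envelope check: hashing n = 30386403 keys into 2^32 buckets is expected to lose about n^2 / (2 * 2^32) ≈ 107,000 rows to collisions, very close to the observed loss of 30386403 - 30279525 = 106,878.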
//------------------------------20210603-----------------------------------------
After removing the hashCode and re-importing with the rowkey split[0] + split[1], only 16 of the 30386403 rows were lost, which is acceptable. The likely cause is that some (split[0] + split[1]) values still repeat: either the same (userid, friendid) pair occurs more than once in the topic, or the bare concatenation is ambiguous ("1" + "23" and "12" + "3" both yield "123").
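A minimal sketch of a safer rowkey for the loop above, joining the two ids with a separator so that ("1","23") and ("12","3") no longer map to the same key; the underscore delimiter is an assumption and must not occur inside the ids:

// hypothetical replacement for the rowkey construction inside the consumer loop
String rowkey = split[0] + "_" + split[1];
Put put = new Put(Bytes.toBytes(rowkey));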