Required file:
user_friends.csv
Directories used (source spooling dir, file-channel data dir, file-channel checkpoint dir):
/opt/flume/conf/jobkb09/dataSource/userFriend/
/opt/flume/conf/jobkb09/dataChannel/userFriend/
/opt/flume/conf/jobkb09/checkpointFile/userFriend/
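These directories need to exist before the agent starts (the spooling directory source in particular errors out if its spoolDir is missing). A minimal sketch, assuming the paths above:
mkdir -p /opt/flume/conf/jobkb09/dataSource/userFriend
mkdir -p /opt/flume/conf/jobkb09/dataChannel/userFriend
mkdir -p /opt/flume/conf/jobkb09/checkpointFile/userFriend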
I. Use Flume to read the file into Kafka
1. Configure Flume (user-friends-flume.conf):
# Agent components
userFriend.sources=userFriendSource
userFriend.channels=userChannel
userFriend.sinks=userFriendSink

# Spooling directory source: read userFriend_YYYY-MM-DD.csv line by line
userFriend.sources.userFriendSource.type=spooldir
userFriend.sources.userFriendSource.spoolDir=/opt/flume/conf/jobkb09/dataSource/userFriend
userFriend.sources.userFriendSource.deserializer=LINE
userFriend.sources.userFriendSource.deserializer.maxLineLength=320000
userFriend.sources.userFriendSource.includePattern=userFriend_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv

# Interceptor: drop the CSV header row ("user,friends")
userFriend.sources.userFriendSource.interceptors=head_filter
userFriend.sources.userFriendSource.interceptors.head_filter.type=regex_filter
userFriend.sources.userFriendSource.interceptors.head_filter.regex=^user,friends*
userFriend.sources.userFriendSource.interceptors.head_filter.excludeEvents=true

# File channel
userFriend.channels.userChannel.type=file
userFriend.channels.userChannel.checkpointDir=/opt/flume/conf/jobkb09/checkpointFile/userFriend
userFriend.channels.userChannel.dataDir=/opt/flume/conf/jobkb09/dataChannel/userFriend

# Kafka sink: write to the user_friends_raw topic
userFriend.sinks.userFriendSink.type=org.apache.flume.sink.kafka.KafkaSink
userFriend.sinks.userFriendSink.batchSize=640
userFriend.sinks.userFriendSink.brokerList=192.168.136.10:9092
userFriend.sinks.userFriendSink.topic=user_friends_raw

# Wire the source and sink to the channel
userFriend.sources.userFriendSource.channels=userChannel
userFriend.sinks.userFriendSink.channel=userChannel
2. Copy the data file into the spooling directory, renaming it to match includePattern:
cp user_friends.csv /opt/flume/conf/jobkb09/dataSource/userFriend/userFriend_2021-01-06.csv
3. Create the Kafka topics:
kafka-topics.sh --create --zookeeper 192.168.136.10:2181 --topic user_friends_raw --partitions 1 --replication-factor 1
kafka-topics.sh --create --zookeeper 192.168.136.10:2181 --topic user_friends --partitions 1 --replication-factor 1
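To confirm that both topics were created, the topic list can be queried against the same zookeeper:
kafka-topics.sh --zookeeper 192.168.136.10:2181 --list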
4. Start the Flume agent to ship the data:
flume-ng agent --name userFriend --conf /opt/flume/conf/ --conf-file /opt/flume/conf/jobkb09/user-friends-flume.conf -Dflume.root.logger=INFO,console
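Once the agent has drained the spooled file, the raw topic can be spot-checked with a console consumer (a quick sanity check, not required by the pipeline):
kafka-console-consumer.sh --bootstrap-server 192.168.136.10:9092 --topic user_friends_raw --from-beginning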
5. Use Kafka Streams to filter and reshape the data between topics: each "user,friend1 friend2 ..." line in user_friends_raw is split into one "user,friend" record per friend and written to user_friends.
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.Topology;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.CountDownLatch;

// Class name is illustrative; the original snippet did not show the class declaration.
public class UserFriendStream {
    public static void main(String[] args) {
        // Kafka Streams runtime configuration
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "userFriend2");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());

        // Split each "user,friend1 friend2 ..." line into one "user,friend" record per friend
        StreamsBuilder builder = new StreamsBuilder();
        builder.stream("user_friends_raw").flatMap((k, v) -> {
            List<KeyValue<String, String>> list = new ArrayList<>();
            String[] info = v.toString().split(",");
            if (info.length == 2 && info[0].trim().length() > 0) {
                String[] friends = info[1].split("\\s+");
                for (String friend : friends) {
                    list.add(new KeyValue<>(null, info[0] + "," + friend));
                }
            }
            return list;
        }).to("user_friends");

        final Topology topo = builder.build();
        final KafkaStreams streams = new KafkaStreams(topo, prop);

        // Close the application cleanly on Ctrl+C / JVM shutdown
        final CountDownLatch latch = new CountDownLatch(1);
        Runtime.getRuntime().addShutdownHook(new Thread("sum2") {
            @Override
            public void run() {
                streams.close();
                latch.countDown();
            }
        });

        streams.start();
        try {
            latch.await();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
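With the Streams application running, the reshaped "user,friend" records should appear in user_friends and can be checked the same way:
kafka-console-consumer.sh --bootstrap-server 192.168.136.10:9092 --topic user_friends --from-beginning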
II. Read data from Kafka into HBase
1. Create the table in HBase
a. Create a namespace (similar to a database): create_namespace 'event_db'
b. Create the table: create 'event_db:user_friend','uf'
user_friend: table name
uf: column family
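As a quick check that the namespace and table exist, the definitions can be listed from the hbase shell (a sketch; the same commands can also be typed interactively):
echo "list_namespace" | hbase shell
echo "describe 'event_db:user_friend'" | hbase shell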
2. Code
Maven dependencies:
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>1.2.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>1.2.0</version>
</dependency>
Basic approach (a single consumer loop)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;

import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;

// Class name is illustrative; the original snippet only showed the method body.
public class UserFriendToHbase {
    public static void main(String[] args) throws IOException {
        // Consumer configuration
        final Properties prop = new Properties();
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092");
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 30000);
        prop.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, 1000);
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, "userFriendHbase");

        // Connect to Kafka and subscribe to the topic
        final KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop);
        consumer.subscribe(Collections.singleton("user_friends"));

        // Connect to HBase
        final Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", "hdfs://192.168.136.10:9000/hbase");
        conf.set("hbase.zookeeper.quorum", "192.168.136.10");
        // In testing, the import also worked without the clientPort line; HBase here uses an external zookeeper.
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        final Connection connection = ConnectionFactory.createConnection(conf);

        // Get the target HBase table
        final Table table = connection.getTable(TableName.valueOf("event_db:user_friend"));

        while (true) {
            // poll() blocks for at most 100 ms while waiting for new records
            final ConsumerRecords<String, String> poll = consumer.poll(Duration.ofMillis(100));
            // Turn every "user,friend" record into a Put keyed by the hash of user + friend
            final List<Put> list = new ArrayList<Put>();
            for (ConsumerRecord<String, String> p : poll) {
                final String[] msg = p.value().trim().split(",");
                final Put put = new Put(Bytes.toBytes((msg[0] + msg[1]).hashCode()));
                put.addColumn("uf".getBytes(), "user".getBytes(), msg[0].getBytes());
                put.addColumn("uf".getBytes(), "friend".getBytes(), msg[1].getBytes());
                list.add(put);
            }
            table.put(list);
            // Auto-commit is disabled, so commit offsets once the batch has been written to HBase
            consumer.commitSync();
        }
    }
}
3. Check the data in HBase:
scan 'event_db:user_friend'
4. Note: if the code fails with a permission error, grant access in the hbase shell:
grant 'Adminor','RWXCA','event_db:user_friend'
Adminor: the Windows administrator account name (the user the client runs as).
Advanced approach (interfaces plus small classes)
Kafka environment configuration:
package ToHbase4;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.StringDeserializer;

import java.util.Properties;

/**
 * Shared Kafka consumer configuration; concrete consumers only supply the group name.
 */
public abstract class KafkaEnvironment implements Demand {
    Properties prop = null;

    public KafkaEnvironment(String groupName) {
        prop = new Properties();
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092");
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
        prop.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 30000);
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, groupName);
    }
}
HBase connection and write (HbaseTable):
package ToHbase4;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.kafka.clients.consumer.ConsumerRecords;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Opens the HBase connection and writes the Put list produced by a Records implementation.
 */
public class HbaseTable implements IWrite {
    Connection connection = null;
    Records record;

    public HbaseTable(Records record) {
        this.record = record;
        final Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", "hdfs://192.168.136.10:9000/hbase");
        conf.set("hbase.zookeeper.quorum", "192.168.136.10");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        try {
            connection = ConnectionFactory.createConnection(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public int params(ConsumerRecords<String, String> records, String tableName) {
        List<Put> data = new ArrayList<>();
        try {
            final Table table = connection.getTable(TableName.valueOf(tableName));
            data = record.data(records);
            table.put(data);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return data.size();
    }
}
Row structure (how a Kafka record maps to an HBase Put):
package ToHbase4;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;

import java.util.ArrayList;
import java.util.List;

/**
 * Maps every "user,friend" record to one Put:
 * row key = hash of user + friend, columns uf:userId and uf:friend.
 */
public class tableDemon implements Records {
    @Override
    public List<Put> data(ConsumerRecords<String, String> records) {
        final ArrayList<Put> list = new ArrayList<>();
        for (ConsumerRecord<String, String> record : records) {
            final String[] split = record.value().split(",");
            final Put put = new Put(Bytes.toBytes((split[0] + split[1]).hashCode()));
            put.addColumn("uf".getBytes(), "userId".getBytes(), split[0].getBytes());
            put.addColumn("uf".getBytes(), "friend".getBytes(), split[1].getBytes());
            list.add(put);
        }
        return list;
    }
}
Kafka consumer loop that drives the write to HBase:
package ToHbase4;

import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.time.Duration;
import java.util.Collections;

/**
 * Polls the topic and hands every batch to an IWrite implementation.
 */
public class KafkaConsumerDemo extends KafkaEnvironment {
    IWrite write;
    String tableName;
    String topic;

    public KafkaConsumerDemo(IWrite write, String groupName, String topic, String tableName) {
        super(groupName);
        this.write = write;
        this.tableName = tableName;
        this.topic = topic;
    }

    @Override
    public void achive() {
        final KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop);
        consumer.subscribe(Collections.singleton(this.topic));
        try {
            while (true) {
                final ConsumerRecords<String, String> poll = consumer.poll(Duration.ofMillis(100));
                final int params = write.params(poll, this.tableName);
                System.out.println("size:" + params);
                // Auto-commit is disabled in KafkaEnvironment, so commit once the batch is written
                consumer.commitSync();
                Thread.sleep(100);
            }
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
Interfaces:
package ToHbase4;

import org.apache.kafka.clients.consumer.ConsumerRecords;

public interface IWrite {
    public int params(ConsumerRecords<String, String> records, String tableName);
}

package ToHbase4;

import org.apache.hadoop.hbase.client.Put;
import org.apache.kafka.clients.consumer.ConsumerRecords;

import java.util.List;

public interface Records {
    public List<Put> data(ConsumerRecords<String, String> records);
}

package ToHbase4;

public interface Demand {
    public void achive();
}
Run:
package ToHbase4;

/**
 * Wires the pieces together and starts the consumer.
 */
public class Driver {
    public static void main(String[] args) {
        final tableDemon tableDemon = new tableDemon();
        final HbaseTable hbaseTable = new HbaseTable(tableDemon);
        final KafkaConsumerDemo demo = new KafkaConsumerDemo(hbaseTable, "qwrq", "user_friends", "event_db:user_friend");
        demo.achive();
    }
}
The code above can be improved further during development: move the connection settings into a *.properties file instead of hard-coding them:
# hbase.rootdir must be a single HDFS URI (use the HA nameservice if NameNode HA is enabled)
hbase.rootdir=hdfs://192.168.136.30:9000/hbase
hbase.zookeeper.quorum=192.168.136.30,192.168.136.31,192.168.136.32
hbase.zookeeper.property.clientPort=2181
bootstrap_servers_config=192.168.136.30:9092,192.168.136.31:9092,192.168.136.32:9092
auto_offset_reset_config=earliest
enable_auto_commit_config=false
session_timeout_ms_config=30000
Then, in the configuration classes, load the values from that file. In HbaseTable:
Connection connection = null;
Records record;
Properties prop = null;

public HbaseTable(Records record) {
    this.record = record;
    prop = new Properties();
    try {
        prop.load(new FileInputStream("data/file/conf.properties"));
        String rootdir = prop.getProperty("hbase.rootdir");
        String quorum = prop.getProperty("hbase.zookeeper.quorum");
        String clientPort = prop.getProperty("hbase.zookeeper.property.clientPort");
        final Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", rootdir);
        conf.set("hbase.zookeeper.quorum", quorum);
        conf.set("hbase.zookeeper.property.clientPort", clientPort);
        connection = ConnectionFactory.createConnection(conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
}

@Override
public int params(ConsumerRecords<String, String> records, String tableName) {
    List<Put> data = new ArrayList<>();
    try {
        final Table table = connection.getTable(TableName.valueOf(tableName));
        data = record.data(records);
        table.put(data);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return data.size();
}
And in KafkaEnvironment:
// protected: subclasses can use prop, but code outside the hierarchy cannot modify it at will
protected Properties prop = null;
public KafkaEnvironment(String groupName) {
    prop = new Properties();
    try {
        prop.load(new FileInputStream("data/file/conf.properties"));
        String bootstrap = prop.getProperty("bootstrap_servers_config");
        String reset = prop.getProperty("auto_offset_reset_config");
        String commit = prop.getProperty("enable_auto_commit_config");
        String timeout = prop.getProperty("session_timeout_ms_config");
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrap);
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, timeout);
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, reset);
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, commit);
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, groupName);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
III. Map the HBase table into Hive
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.auto.convert.join=false;
-- The column mapping must match the qualifiers actually written to HBase
-- (the advanced writer above uses uf:userId and uf:friend)
create external table events.hb_user_friend(row_key string, user_id string, friend_id string)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties('hbase.columns.mapping'=':key,uf:userId,uf:friend')
tblproperties('hbase.table.name'='event_db:user_friend');
-- Copy the data into a native table stored as ORC for faster querying
create table user_friend
stored as ORC as select * from hb_user_friend;
-- Drop the mapping table once it is no longer needed
drop table if exists hb_user_friend;
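As a final sanity check, the row count of the ORC copy can be compared with the number of records the consumer reported (a sketch, assuming the ORC table was created inside the events database):
hive -e "select count(*) from events.user_friend;"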