Step 1: Create the Hive table and add a partition. Taking the current date as an example: if today is 20230608, then dt is also 20230608. In production you would normally add the daily partition with a scheduled job (a sketch of such a job follows the DDL below).
create external table basic.monitor_car_data (
carNo string,
speed double,
`timeStamp` bigint
)
partitioned by(dt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- Add the partition
alter table monitor_car_data add partition (dt='20230607') location '/user/hive/warehouse/basic.db/monitor_car_data/20230607';
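The scheduled partition-add job mentioned in step 1 is not shown in the original; a minimal sketch using Hive JDBC might look like the following (the HiveServer2 address, port, and credentials are assumptions, the org.apache.hive:hive-jdbc dependency would be needed, and the class would be triggered by cron or any job scheduler):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

public class AddDailyPartition {
    public static void main(String[] args) throws Exception {
        // Today's date in the same yyyyMMdd format as the dt partition column
        String dt = LocalDate.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
        String sql = "alter table basic.monitor_car_data add if not exists partition (dt='" + dt + "') "
                + "location '/user/hive/warehouse/basic.db/monitor_car_data/" + dt + "'";
        // Hypothetical HiveServer2 endpoint; adjust host, port, and credentials to your cluster
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection conn = DriverManager.getConnection("jdbc:hive2://192.168.88.151:10000/basic", "root", "");
             Statement stmt = conn.createStatement()) {
            stmt.execute(sql);
        }
    }
}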
Step 2: Write the code. Add the dependencies below to pom.xml. The code also contains a Kafka source configuration you can refer to; the IP addresses belong to my own virtual machine, which is not connected to the public network, so they are not reachable from outside. Next, the most important issue is checkpointing (a configuration sketch follows this list):
1. In production, always configure a durable checkpoint storage directory.
2. Set a reasonable checkpoint interval; the shorter the interval, the more small files are produced.
3. Checkpointing must be enabled, otherwise the output files stay stuck in the .part-0-0.inprogress state.
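A minimal sketch of production-oriented checkpoint settings, assuming Flink 1.13; the interval and HDFS path below are placeholder values, not taken from the original code:

import java.util.concurrent.TimeUnit;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class CheckpointSettingsSketch {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // A longer interval means fewer, larger part files; one minute is only an illustrative value
        env.enableCheckpointing(TimeUnit.MINUTES.toMillis(1));
        // Durable checkpoint storage on HDFS (placeholder path, adjust to your cluster)
        env.getCheckpointConfig().setCheckpointStorage("hdfs://192.168.88.151:8020/flink/checkpoints");
    }
}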
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>1.13.1</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.11</artifactId>
    <version>1.13.1</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-api-java-bridge_2.11</artifactId>
    <version>1.13.1</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>1.13.1</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.13.1</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.4</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.4</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-parquet_2.11</artifactId>
    <version>1.13.1</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-filesystem_2.11</artifactId>
    <version>1.8.0</version>
</dependency>
import java.time.ZoneId;
import java.util.concurrent.TimeUnit;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import org.apache.flink.util.Collector;

public class Kafka2HDFS {
    public static void main(String[] args) throws Exception {
        // Flink execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Enable checkpointing. Part files are only finalized on checkpoints, so the shorter
        // the interval, the more small files are produced; use a larger interval in production.
        env.enableCheckpointing(200L);
        // Kafka source configuration (kept commented out for reference)
        // Properties prop = new Properties();
        // prop.put("bootstrap.servers","192.168.88.151:9092");
        // prop.put("key.deserializer","org.apache.kafka.common.serialization.StringDeserializer");
        // prop.put("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer");
        // prop.put("auto.offset.reset","latest");
        // prop.put("group.id","test-1");
        // String topic ="topic-test";
        // FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(topic, new SimpleStringSchema(), prop);
        // DataStreamSource<String> streamSource = env.addSource(consumer);
        // Set the user that writes to HDFS
        System.setProperty("HADOOP_USER_NAME", "root");
        // Simulated data source: a socket text stream used instead of Kafka for local testing
        DataStreamSource<String> streamSource = env.socketTextStream("192.168.88.151", 7777);
        streamSource.print("source_");
        // Parse each line into a CarInfo, then format it back into a comma-separated string
        SingleOutputStreamOperator<String> streamOperator = streamSource.flatMap(new FlatMapFunction<String, CarInfo>() {
            @Override
            public void flatMap(String s, Collector<CarInfo> collector) throws Exception {
                String[] s1 = s.split(" ");
                collector.collect(new CarInfo(s1[0], Double.valueOf(s1[1]), Long.valueOf(s1[2])));
            }
        }).map(new MapFunction<CarInfo, String>() {
            @Override
            public String map(CarInfo carInfo) throws Exception {
                return carInfo.getCarNo() + "," + carInfo.getSpeed() + "," + carInfo.getTimestamp();
            }
        });
        // Rolling policy: conditions for rolling over to a new part file
        DefaultRollingPolicy<String, String> defaultRollingPolicy = DefaultRollingPolicy.builder()
                .withInactivityInterval(TimeUnit.SECONDS.toMillis(10)) // roll to a new file after 10s of inactivity
                .withRolloverInterval(TimeUnit.SECONDS.toMillis(30))   // roll to a new file every 30s regardless of activity (default 60s)
                .withMaxPartSize(1024 * 1024 * 1024)                   // maximum part file size, here 1 GB (default 128 MB)
                .build();
        // Part file prefix and suffix
        OutputFileConfig fileConfig = OutputFileConfig
                .builder()
                .withPartPrefix("data")
                .withPartSuffix(".txt")
                .build();
        // HDFS sink configuration
        final StreamingFileSink<String> sink = StreamingFileSink
                .forRowFormat(new Path("hdfs://192.168.88.151:8020/user/hive/warehouse/basic.db/monitor_car_data/"), new SimpleStringEncoder<String>("UTF-8")) // target path and encoding
                .withBucketAssigner(new DateTimeBucketAssigner<>("yyyyMMdd", ZoneId.of("Asia/Shanghai"))) // one bucket (directory) per day, matching the dt partition
                .withRollingPolicy(defaultRollingPolicy) // apply the rolling policy
                .withOutputFileConfig(fileConfig)
                .build();
        streamOperator.print("map_");
        streamOperator.addSink(sink).setParallelism(1);
        env.execute();
    }
}
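The CarInfo POJO referenced above is not included in the original post; a minimal sketch that matches the constructor and getters used in the job could look like this (the field names are inferred, so treat them as assumptions):

// Hypothetical CarInfo POJO, reconstructed from the calls in Kafka2HDFS
public class CarInfo {
    private String carNo;
    private Double speed;
    private Long timestamp;

    public CarInfo(String carNo, Double speed, Long timestamp) {
        this.carNo = carNo;
        this.speed = speed;
        this.timestamp = timestamp;
    }

    public String getCarNo() { return carNo; }
    public Double getSpeed() { return speed; }
    public Long getTimestamp() { return timestamp; }
}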
Problem solved: the job writes data straight into the Hive table's HDFS directory, so no Hive configuration files are needed. The next case will cover writing into Hive with Flink directly.