Like an offline data warehouse, a real-time warehouse needs to persist business data for later troubleshooting. Here, the business systems produce their data into Kafka, and a Flink job consumes the Kafka data and writes it into Hive as-is, with no transformation, landing the complete records. This is equivalent to the ODS layer of an offline warehouse.
The code follows directly; it is commented in detail.
Part 1: the main program:
import com.migudm.flink001.utils.KafkaSourceHelper;
import com.migudm.flink001.utils.baseInfo;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.bridge.java.StreamStatementSet;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.Catalog;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import java.util.Properties;
public class kafkasinkhive {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings mySetting = EnvironmentSettings
                .newInstance()
                .inStreamingMode()
                .build();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env, mySetting);
        StreamStatementSet set = t_env.createStatementSet();
        Properties properties = baseInfo.initBaseProp();
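        // baseInfo.initBaseProp() is a project utility and is not shown in this part;
        // it presumably reads the properties referenced below (bootstrap.servers,
        // checkpointStorage, table_name, hiveConfDir) from a config file.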
        String checkpointStorage = properties.getProperty("checkpointStorage");
        String kafkaServers = properties.getProperty("bootstrap.servers");
        /*
         * For a normal deployment, change the following three variables:
         * topName, groupIdName, databaseName
         */
        // Kafka topic name
        String topName = "testjson0418";
        // Kafka consumer group id
        String groupIdName = "testjson041800";
        // Hive database name
        String databaseName = "hive_pub";
        String sourceName = "kafka_" + topName;
        // RocksDB state backend with incremental checkpoints enabled
        env.setStateBackend(new EmbeddedRocksDBStateBackend(true));
        env.getCheckpointConfig().setCheckpointStorage(checkpointStorage);
        // Trigger a checkpoint every minute
        env.enableCheckpointing(1 * 60 * 1000);
        // Exactly-once checkpointing mode
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        // At least 2 minutes between two checkpoints (this pause dominates the
        // 1-minute interval above, so checkpoints end up roughly 2 minutes apart)
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(2 * 60 * 1000);
        // Checkpoint timeout: 3 minutes
        env.getCheckpointConfig().setCheckpointTimeout(3 * 60 * 1000);
        // Number of checkpoint failures to tolerate before the job fails
        env.getCheckpointConfig().setTolerableCheckpointFailureNumber(3);
        // env.setParallelism(1);
        // Retain checkpoints when the job is cancelled; note that in this case the
        // checkpoint state must be cleaned up manually after cancellation
        env.getCheckpointConfig().setExternalizedCheckpointCleanup(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        // Restart strategy: up to 3 restarts, 10 seconds apart
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, org.apache.flink.api.common.time.Time.seconds(10)));
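        // KafkaSourceHelper.flink_KafkaSource is a project helper and is not shown in
        // this part; assuming it wraps Flink's standard KafkaSource builder, the call
        // below is roughly equivalent to this sketch (the actual starting-offsets and
        // deserializer choices may differ):
        // KafkaSource.<String>builder()
        //         .setBootstrapServers(kafkaServers)
        //         .setTopics(topName)
        //         .setGroupId(groupIdName)
        //         .setStartingOffsets(OffsetsInitializer.earliest())
        //         .setValueOnlyDeserializer(new SimpleStringSchema())
        //         .build();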
        KafkaSource<String> kafkaSource = KafkaSourceHelper.flink_KafkaSource(String.class, topName, groupIdName);
        DataStream<String> input = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), sourceName);
        // Load the remaining parameters from the config file
        String hiveCatalogName = "Hive";
        String tableName = properties.getProperty("table_name");
        // Switch to the Hive dialect so Hive DDL can be executed below
        t_env.getConfig().setSqlDialect(SqlDialect.HIVE);
        Catalog hiveCatalog = new HiveCatalog(hiveCatalogName, databaseName, properties.getProperty("hiveConfDir"));
        t_env.registerCatalog(hiveCatalogName, hiveCatalog);
        t_env.useCatalog(hiveCatalogName);
        // Create an hourly-partitioned table: partition spec and table properties
        String tblProperties = "PARTITIONED BY (ts_date STRING, ts_hour STRING) STORED AS TEXTFILE TBLPROPERTIES ("