package autox.xdata.iceberg;
import com.alibaba.fastjson2.JSONArray;
import org.apache.commons.lang.text.StrSubstitutor;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.configuration.*;
import org.apache.flink.streaming.api.environment.*;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableResult;
import javaProto.autox.xdata.RunTaskStatus;
import com.alibaba.fastjson2.JSONObject;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import autox.xdata.http.HttpRequest;
import org.apache.iceberg.aws.s3.S3FileIO;
import org.apache.iceberg.jdbc.JdbcCatalog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.flink.types.Row;
import org.apache.flink.util.CloseableIterator;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
public class QueryIceberg {
public static final Logger logger = LoggerFactory.getLogger("autox.xdata.iceberg.queryIcebergJob");
public static boolean save_result(String service, String url_path, JSONObject res) {
HttpRequest httpRequest = new HttpRequest();
return httpRequest.postJsonData(service + url_path, res.toJSONString());
}
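// Register the project's custom SQL functions so they can be referenced from the
// user-supplied query fragments passed in via --notXraySql / --xrayWhereSql.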
public static void register_functions(StreamTableEnvironment tableEnv) {
tableEnv.createTemporarySystemFunction("DayOrNight", QueryUDF.DayOrNight.class);
tableEnv.createTemporarySystemFunction("DisengageTimes", QueryUDF.DisengageTimes.class);
tableEnv.createTemporarySystemFunction("TagsContain", QueryUDF.TagsContain.class);
tableEnv.createTemporarySystemFunction("EventsTags", QueryUDF.EventsTags.class);
}
public static void main(String[] args) throws Exception {
StopWatch sw = new StopWatch();
sw.start();
// parameters
ParameterTool parameter = ParameterTool.fromArgs(args);
String taskId = parameter.getRequired("taskId");
String jobName = parameter.get("jobName", "jobName_icebergQueryJob_" + taskId);
logger.info(String.format("Parameter: %s, jobName: %s", parameter.getConfiguration(), jobName));
String base64_not_xray_sql = parameter.getRequired("notXraySql");
String base64_xray_where_sql = parameter.getRequired("xrayWhereSql");
String catalogName = parameter.get("catalogName", "xdata");
Double scenario_merge_interval_ms = Double.valueOf(parameter.get("scenarioMergeIntervalMs", "1000")); // 1 s
String service = parameter.get("service", "http://10.10.82.153:5000");
String url_path = parameter.get("url_path", "/scenario_search/result/save");
String conf_path = parameter.getRequired("config");
Integer await_min = parameter.getInt("await_min", 30);
// conf
Map<String, String> conf = ParameterTool.fromPropertiesFile(conf_path).getConfiguration().toMap();
int parallelism = Integer.parseInt(conf.getOrDefault("flink.query.parallelism", "20"));
// init
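// Template for the final result query: join the per-scenario hits (materialized into the
// temporary table ${tmp}.${res}) with xraydb.xrayMeta to attach host name, xray URL and
// timing fields. ${tmp} and ${res} are filled in below with StrSubstitutor.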
String origin_xray_columns = "select _xray_id as record, " +
"record_metadata.hostname as hostName, " +
"concat(xray_server_info.domain, '/id?', _xray_id) as xrayUrl, " +
"${tmp}.${res}.startTimeMs / 1000 as startTime, " +
"(${tmp}.${res}.endTimeMs - ${tmp}.${res}.startTimeMs) as duration, " +
"(${tmp}.${res}.startTimeMs / 1000 - record_metadata.time_info.start_timestamp) as relativeTime " +
"from ${tmp}.${res} inner join xraydb.xrayMeta on xraydb.xrayMeta._xray_id = ${tmp}.${res}.xrayId ";
String origin_data_tmp_table_name = "default_catalog.default_database.tmp";
String sink_namespace = "tmp";
String table_name = "query_iceberg_res_" + taskId;
HashMap<String, String> valuesMap = new HashMap<>() {{
put("res", table_name);
put("tmp", sink_namespace);
}};
String xray_columns = new StrSubstitutor(valuesMap).replace(origin_xray_columns);
ParseIcebergData job = new ParseIcebergData();
String not_xray_sql = String.format("insert into %s %s", origin_data_tmp_table_name, job.parse_base64_sql(base64_not_xray_sql));
String xray_sql = xray_columns + job.parse_base64_sql(base64_xray_where_sql);
String drop_table_sql = String.format("drop table if exists %s.%s", sink_namespace, table_name);
logger.info(String.format("jobName: %s, prepare use_time: %dms \n", jobName, sw.getTime()));
sw.reset();
sw.start();
// response msg
String error_msg = "";
Integer taskStatus = RunTaskStatus.FAILED_VALUE;
long execute_sql_used_time_ms = 0;
JSONArray record_list = new JSONArray();
TableResult execute_res = null;
StreamTableEnvironment tableEnv = null;
JSONObject res = new JSONObject();
res.put("taskId", taskId);
try {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// env.setRestartStrategy(RestartStrategies.fixedDelayRestart(
// 3, // number of restart attempts
// Time.of(10, TimeUnit.SECONDS) // delay
// ));
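// Operators run at the parallelism taken from the config file; max parallelism is set to
// three times that value (presumably to leave headroom for rescaling).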
env.setMaxParallelism(parallelism * 3);
// env.setRuntimeMode(RuntimeExecutionMode.BATCH);
env.setParallelism(parallelism);
logger.info("env.getConfiguration:" + env.getConfiguration());
logger.info("env.getConfig():" + env.getConfig());
tableEnv = StreamTableEnvironment.create(env);
register_functions(tableEnv);
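// Register an Iceberg catalog backed by a JDBC metastore and S3 object storage;
// all endpoints and credentials come from the properties file passed via --config.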
tableEnv.executeSql(String.format("CREATE CATALOG %s WITH (" +
"'type'='iceberg'," +
"'catalog-impl'='%s'," +
"'warehouse'='%s'," +
"'io-impl'='%s'," +
"'uri'='%s'," +
"'jdbc.user'='%s'," +
"'jdbc.password'='%s'," +
"'s3.endpoint'='%s'," +
"'s3.access-key-id'='%s'," +
"'s3.secret-access-key'='%s'," +
"'s3.region'='default')",
catalogName, JdbcCatalog.class.getName(),
conf.get("s3.file_location"), S3FileIO.class.getName(),
conf.get("jdbc.uri"), conf.get("jdbc.user"), conf.get("jdbc.password"),
conf.get("s3.endpoint"), conf.get("s3.access-key-id"), conf.get("s3.secret-access-key"))
);
tableEnv.useCatalog(catalogName);
tableEnv.executeSql(drop_table_sql).await(await_min, TimeUnit.SECONDS);
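// Create the temporary sink table through the custom xDataScenarioIcebergTableSink
// connector, which writes the scenario hits into <catalog>.<sink-namespace>.<sink-table-name>
// using the configured merge interval.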
tableEnv.executeSql(String.format("CREATE TABLE %s (name STRING, timestamp_ms DOUBLE) WITH (" +
"'connector' = 'xDataScenarioIcebergTableSink'," +
"'merge-interval-ms' = '%s'," +
"'sink-catalog-id' = '%s'," +
"'sink-namespace' = '%s'," +
"'sink-table-name' = '%s'" +
")",
origin_data_tmp_table_name, scenario_merge_interval_ms, catalogName, sink_namespace, table_name)
);
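// Phase 1: run the user-supplied scenario query and materialize its hits into the
// temporary Iceberg table (blocks for up to await_min minutes).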
logger.info("not_xray_sql: " + not_xray_sql);
tableEnv.executeSql(not_xray_sql).await(await_min, TimeUnit.MINUTES);
logger.info("xray_sql: " + xray_sql);
execute_res = tableEnv.executeSql(xray_sql);
record_list = job.parse_iceberg_result(execute_res);
taskStatus = RunTaskStatus.FINISHED_VALUE;
execute_sql_used_time_ms = sw.getTime();
logger.info(String.format("execute finish! jobName: %s, executeSql use_time: %dms \n", jobName, execute_sql_used_time_ms));
} catch (Exception e) {
logger.error(String.format("jobName: %s execute failed", jobName), e);
error_msg = e.getLocalizedMessage();
}
res.put("taskStatus", taskStatus);
res.put("record", record_list);
res.put("used_time_ms", execute_sql_used_time_ms);
res.put("error_msg", error_msg);
logger.info(String.format("finish! size:%s : jobName: %s, res: %s", record_list.size(), jobName, res));
boolean is_save = save_result(service, url_path, res);
if (is_save && tableEnv != null) {
logger.info("save_result successfully! start execute sql: " + drop_table_sql);
tableEnv.executeSql(drop_table_sql).await(await_min, TimeUnit.SECONDS);
}
}
}
QueryIceberg code backup
This code sample shows how to use Apache Iceberg as the data store in a Flink stream-processing environment and run SQL queries against it. The program first defines several custom functions, then reads its configuration, such as the SQL statements and the task ID, from command-line arguments. It then creates a Flink streaming environment, registers the custom functions, and sets up an Iceberg catalog. The program runs the SQL insert and query operations and posts the result to the configured service; once the result has been saved successfully, it also drops the temporary table.
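The properties file passed via --config is not included in this backup; its expected keys can be read off the code above. A minimal sketch, with placeholder values only:

# Flink parallelism for the query job (the code falls back to 20 if absent)
flink.query.parallelism=20
# JDBC metastore backing the Iceberg catalog
jdbc.uri=jdbc:mysql://<host>:<port>/<catalog-db>
jdbc.user=<user>
jdbc.password=<password>
# S3-compatible object storage holding the Iceberg warehouse
s3.file_location=s3://<bucket>/<warehouse-path>
s3.endpoint=http://<endpoint>:<port>
s3.access-key-id=<access-key>
s3.secret-access-key=<secret-key>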
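A hypothetical submission command (the jar name, paths and argument values are illustrative only; the two SQL arguments must be base64-encoded, since the job decodes them via parse_base64_sql):

flink run -c autox.xdata.iceberg.QueryIceberg xdata-iceberg-query.jar \
  --taskId 20240101_001 \
  --config /opt/xdata/conf/iceberg-query.properties \
  --notXraySql <base64-encoded scenario SELECT> \
  --xrayWhereSql <base64-encoded WHERE clause> \
  --catalogName xdata \
  --service http://10.10.82.153:5000 \
  --url_path /scenario_search/result/save

The QueryUDF implementations are also not part of this backup. A minimal sketch of what one of the registered scalar functions could look like, assuming a simple UTC day/night cut-off (the real classification logic is project-internal and may differ):

import org.apache.flink.table.functions.ScalarFunction;

public class QueryUDF {
    // Hypothetical sketch: classify a millisecond timestamp as "day" or "night".
    public static class DayOrNight extends ScalarFunction {
        public String eval(Double timestampMs) {
            if (timestampMs == null) {
                return "unknown";
            }
            // Assumed cut-off: 06:00-18:00 UTC counts as day.
            long hour = (long) ((timestampMs / 1000 / 3600) % 24);
            return (hour >= 6 && hour < 18) ? "day" : "night";
        }
    }
}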