Project scenario:
At work, the application team wants all of the data shown for one metric on a page to be stored in a single row, to reduce the I/O consumed by data transfer.
Originally, each value of a metric was stored as its own (key, value) pair.
Example: xxxx1001, (20221203:8.96) and xxxx1001, (20230128:5.46). The part before the colon is the business date of the metric, and the part after it is the metric's value.
Now all of the values are to be merged under one metric, e.g. xxxx1001, (20221203:8.96,20230128:5.46).
After discussion with the data team, a UDF needs to be designed to meet this requirement.
Code example
The UDF must satisfy the following requirements:
1. A newly written value for a date replaces the old value for that date.
2. When an input is empty, no merge is performed (the non-empty side is returned as-is).
3. The merged data is combined into a single value.
import java.util.Arrays;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.log4j.Logger;

public class formastring extends UDF {

    private static final Logger logger = Logger.getLogger(formastring.class);

    public formastring() {}

    public String evaluate(String strnew, String strold) {
        if (StringUtils.isBlank(strold) && StringUtils.isBlank(strnew)) {
            return "";
        } else if (StringUtils.isBlank(strold)) {
            return strnew;
        } else if (StringUtils.isBlank(strnew)) {
            return strold;
        } else {
            Map<String, Double> collectold;
            try {
                // Parse "date:value,date:value" into a map; if the same date
                // appears twice within one string, keep the later value.
                Map<String, Double> collectnew = Arrays.stream(strnew.split(","))
                        .map(x -> x.split(":"))
                        .collect(Collectors.toMap(e -> e[0], e -> Double.parseDouble(e[1]), (v1, v2) -> v2));
                collectold = Arrays.stream(strold.split(","))
                        .map(x -> x.split(":"))
                        .collect(Collectors.toMap(e -> e[0], e -> Double.parseDouble(e[1]), (v1, v2) -> v2));
                // New dates overwrite old ones (requirement 1).
                collectold.putAll(collectnew);
            } catch (Exception e) {
                logger.info("bad input: strnew=" + strnew + ", strold=" + strold);
                e.printStackTrace();
                return "a"; // sentinel value marking unparseable input
            }
            // Map.toString() yields "{date=value, date=value}"; strip the braces.
            return collectold.toString().replaceAll("([{}])", "");
        }
    }
    public static void main(String[] args) {
        formastring udf = new formastring();
        // String strnew = "20220101:1.360X10^4,20220103:1";
        // String strold = "20220101:1,20220102:1";
        String strnew = "20221203:8.96,20230128:8.96";
        String strold = "20221204:105.000,20221203:106.000";
        System.out.println(udf.evaluate(strnew, strold));
    }
}
Returned result:
20230128=8.96, 20221204=105.0, 20221203=8.96
(Note that Map.toString() renders each entry as date=value, which is why the separator here is = rather than the : used in the input.)
Solution:
After this UDF is imported into Hive it can be used directly; it is registered under the name mapmerge.
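For reference, the registration and a call might look like the following; the jar path, table name, and column names are hypothetical placeholders, assuming the class is compiled without a package declaration as shown above.

-- register the compiled class under the name mapmerge
ADD JAR hdfs:///path/to/formastring-udf.jar;
CREATE TEMPORARY FUNCTION mapmerge AS 'formastring';

-- merge the newly written pairs into the stored value string
SELECT key, mapmerge(new_value, old_value) AS merged_value
FROM metric_table;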
Below is the Spark version of the UDF.
import java.util.Arrays;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
public class sparkudf {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("MyUDF")
                .master("local[*]")
                .getOrCreate();

        // Create sample data: my_table holds the already merged value string per key
        StructType schema1 = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("key", DataTypes.StringType, true),
                DataTypes.createStructField("value", DataTypes.StringType, true),
                DataTypes.createStructField("data_source", DataTypes.StringType, true),
                DataTypes.createStructField("value_gather_time", DataTypes.StringType, true)
        ));
        Dataset<Row> data1 = spark.createDataFrame(Arrays.asList(
                RowFactory.create("puyi000001", "20220101:1,20220102:1", "wind", "2023-03-20 10:57:38")
        ), schema1);

        // my_table2 holds the newly arrived (date:value) fragments
        StructType schema2 = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("key", DataTypes.StringType, true),
                DataTypes.createStructField("value", DataTypes.StringType, true)
        ));
        Dataset<Row> data2 = spark.createDataFrame(Arrays.asList(
                RowFactory.create("puyi000001", "20220101:1.3"),
                RowFactory.create("puyi000001", "20220104:2"),
                RowFactory.create("puyi000002", "20220101:1")
        ), schema2);

        // Register the UDF: same merge logic as the Hive version, but the result
        // is sorted by date and rendered back in "date:value" form.
        spark.udf().register("formastring", (String strnew, String strold) -> {
            if (strnew == null || strnew.isEmpty()) {
                return strold;
            } else if (strold == null || strold.isEmpty()) {
                return strnew;
            } else {
                Map<String, Double> collectnew = Arrays.stream(strnew.split(","))
                        .map(s -> s.split(":"))
                        .collect(Collectors.toMap(a -> a[0], a -> Double.parseDouble(a[1])));
                Map<String, Double> collectold = Arrays.stream(strold.split(","))
                        .map(s -> s.split(":"))
                        .collect(Collectors.toMap(a -> a[0], a -> Double.parseDouble(a[1])));
                // New dates overwrite old ones.
                collectold.putAll(collectnew);
                return collectold.entrySet().stream()
                        .sorted(Map.Entry.comparingByKey())
                        .map(e -> e.getKey() + ":" + e.getValue())
                        .collect(Collectors.joining(","));
            }
        }, DataTypes.StringType);

        // Use the UDF: collect all new fragments per key into one string,
        // then merge that with the stored value from the old table.
        data1.createOrReplaceTempView("my_table");
        data2.createOrReplaceTempView("my_table2");
        Dataset<Row> result = spark.sql(
                "SELECT a.key, formastring(concat_ws(',', collect_list(a.value)), b.value) AS result " +
                "FROM my_table2 a LEFT JOIN my_table b ON a.key = b.key " +
                "GROUP BY a.key, b.value"
        );
        result.show(100000, false);
        spark.stop();
    }
}
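The LEFT JOIN plus collect_list mirrors the production layout: my_table2 holds the newly written fragments, my_table holds the already merged string, and keys with no stored history (here puyi000002) get a NULL strold, which the UDF handles by returning the new string unchanged. With the sample rows above, the query should print 20220101:1.3,20220102:1.0,20220104:2.0 for puyi000001 and 20220101:1 for puyi000002.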