需求:对多个字段分别做 collect_list,然后把各个 collect_list 的结果拼接(concat)成一个字符串。最初使用的是 concat_ws(),但是发现 collect_list 超过 4 个就会报错,例如:
-- For each (id, name) group, pair every n_cgi_k with its ltencrsrp_k as "cgi,rsrp",
-- collect the pairs of each of the six columns into an array, and join all
-- arrays into one "|"-separated string returned as `result`.
-- NOTE(review): this is the form reported to fail once more than four
-- collect_list arguments are passed to concat_ws.
select concat_ws("|",
collect_list(concat_ws(',',n_cgi_1,ltencrsrp_1)),
collect_list(concat_ws(',',n_cgi_2,ltencrsrp_2)),
collect_list(concat_ws(',',n_cgi_3,ltencrsrp_3)),
collect_list(concat_ws(',',n_cgi_4,ltencrsrp_4)),
collect_list(concat_ws(',',n_cgi_5,ltencrsrp_5)),
collect_list(concat_ws(',',n_cgi_6,ltencrsrp_6))) as result
from test group by id,name;
于是考虑自定义UDF函数。
collect_list 函数在 Hive 中的返回值类型为 array<T>,按理对应 Java 的 java.util.ArrayList<T>,但是在编写 Spark UDF 时以 ArrayList 作为参数类型会报错:
Spark java.lang.ClassCastException: scala.collection.mutable.WrappedArray$ofRef cannot be cast to java.util.ArrayList
最终选择了一个不够优雅但是很方便的方式:直接使用 scala.collection.mutable.WrappedArray 作为 UDF 的参数类型,样例代码如下:
package com.kong.test.UDF;
import org.apache.spark.sql.api.java.UDF5;
import scala.collection.mutable.WrappedArray;
public class TestArray implements UDF5<WrappedArray<String>, WrappedArray<String>, WrappedArray<String>, WrappedArray<String>, WrappedArray<String>, String> {
private static final long serialVersionUID = 1L;
//将array中的元素取出来,并以|隔开
public String call(WrappedArray<String> t1, WrappedArray<String> t2, WrappedArray<String> t3,
WrappedArray<String> t4, WrappedArray<String> t5) throws Exception {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < t1.length(); i++) {
String ele = t1.apply(i);
System.out.println(ele);
if(!"".equals(ele)){
sb.append(ele).append("|");
}
}
for (int i = 0; i < t2.length(); i++) {
String ele = t2.apply(i);
System.out.println(ele);
if(!"".equals(ele)){
sb.append(ele).append("|");
}
};
for (int i = 0; i < t3.length(); i++) {
String ele = t3.apply(i);
System.out.println(ele);
if(!"".equals(ele)){
sb.append(ele).append("|");
}
};
for (int i = 0; i < t4.length(); i++) {
String ele = t4.apply(i);
System.out.println(ele);
if(!"".equals(ele)){
sb.append(ele).append("|");
}
};
for (int i = 0; i < t5.length(); i++) {
String ele = t5.apply(i);
System.out.println(ele);
if(!"".equals(ele)){
sb.append(ele).append("|");
}
};
System.out.println(sb.toString());
String res = sb.toString();
String res1 = res.substring(0, res.length()-1);
return res1;
}
}