软件版本Spark-2.1.3,编程语言:java
需求:从用户画像中抽出所需的用户标签
memberid | labelid |
---|---|
445481544 | 05020101,05030101,05050101,07010301,07040101,06010101,04050301,05010101,01070101,01130501,02010101,04040101,01120501,04030701 |
445481736 | 05030101,01130201,09010101,07030202,04030701,02010101,07040101,05010101,04050301,05020101,07010301,01120301,05050101,01070101 |
数据说明:memberid 为用户唯一标识符,labelid为用户所属的标签,
现在从用户的标签中取几个标签,譬如:05060801,05060701,05060601,02050601,02050501,02050401,11020101,11020102,11020103,11020104,11010101,11010102,11010103,11010104,10010101,10010102,10010103,10010104。
所以采用,使用broadcast过滤
import com.tc.base.AbstractSparkSql;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.*;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import scala.reflect.ClassManifestFactory;
import java.io.IOException;
import java.util.*;
....
//选取的标签集
String[] labels = "05060801,05060701,05060601,02050601,02050501,02050401,11020101,11020102,11020103,11020104,11010101,11010102,11010103,11010104,10010101,10010102,10010103,10010104".trim().split(",");
//将标签广播
List<String> labelColums = Arrays.asList(labels);
Broadcast<List<String>> broadcastColumnList = spark.sparkContext().broadcast(labelColums, ClassManifestFactory.classType(List.class));
从hive中读取源数据
//数据格式为上面表格
Dataset<Row> memberDataTable = spark.sql("select memberid,labelid from tmp_train.tmp_xxx where memberid is not null ");
构建Dataset-encode
// dataset 的encoder
private ExpressionEncoder<Row> getEncoder(List<String> labelColums) {
StructType structTypemember = new StructType();
structTypemember = structTypemember.add("memberid", DataTypes.IntegerType, true);
for (String labelName : labelColums) {
structTypemember = structTypemember.add(labelName, DataTypes.IntegerType, true);
}
ExpressionEncoder<Row> encoder = RowEncoder.apply(structTypemember);
return encoder;
}
map过滤
//构建structtype
ExpressionEncoder<Row> encoder = getEncoder(labelColums);
Dataset<Row> memberTemp = memberDataTable.map((MapFunction<Row, Row>) row -> {
//broadcast核心部分
List<String> columnLabels = broadcastColumnList.value();
List<Integer> result = new ArrayList<>();
Integer memberId = row.<Integer>getAs("memberid");
result.add(memberId);
Boolean isAllZero = false;
try {
List<String> labelList = Arrays.asList(row.<String>getAs("labelid").split(","));
for (String column : columnLabels) {
if (labelList.contains(column)) {
result.add(1);
isAllZero = true;
} else {
result.add(0);
}
}
//如果标签至少有一个存在才输出结果
if (isAllZero) {
return RowFactory.create(result.toArray());
}
} catch (Exception e) {
}
//如果标签全是0,那么这一行memberid设置为null
result.set(0, null);
return RowFactory.create(result.toArray());
}, encoder).filter(col("memberid").isNotNull());
memberTemp.show(false);
memberTemp.printSchema();
构建的data
+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|memberid |05060801|05060701|05060601|02050601|02050501|02050401|11020101|11020102|11020103|11020104|11010101|11010102|11010103|11010104|10010101|10010102|10010103|10010104|
+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|445630465|0 |1 |0 |0 |0 |1 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |
|445646977|0 |1 |0 |0 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445666369|0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445683073|0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445708609|0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |
|445724737|0 |0 |0 |0 |0 |1 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445736449|0 |1 |0 |0 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445752385|0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445766209|0 |0 |0 |0 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445786753|0 |0 |0 |0 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445805185|0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |
|445824961|0 |0 |0 |0 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445846849|0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |
|445861441|0 |0 |0 |0 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445879489|0 |0 |0 |0 |0 |1 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445894849|0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |
|445910593|0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445925377|0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
|445940161|0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |
|445959169|0 |0 |0 |0 |0 |1 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |0 |
+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+