Spark Broadcast使用

软件版本Spark-2.1.3,编程语言:java

需求:从用户画像中抽出所需的用户标签

memberidlabelid
44548154405020101,05030101,05050101,07010301,07040101,06010101,04050301,05010101,01070101,01130501,02010101,04040101,01120501,04030701
44548173605030101,01130201,09010101,07030202,04030701,02010101,07040101,05010101,04050301,05020101,07010301,01120301,05050101,01070101

数据说明:memberid 为用户唯一标识符,labelid为用户所属的标签,
现在从用户的标签中取几个标签,譬如:05060801,05060701,05060601,02050601,02050501,02050401,11020101,11020102,11020103,11020104,11010101,11010102,11010103,11010104,10010101,10010102,10010103,10010104。
所以采用,使用broadcast过滤

import com.tc.base.AbstractSparkSql;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.*;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import scala.reflect.ClassManifestFactory;

import java.io.IOException;
import java.util.*;
....
//选取的标签集
String[] labels = "05060801,05060701,05060601,02050601,02050501,02050401,11020101,11020102,11020103,11020104,11010101,11010102,11010103,11010104,10010101,10010102,10010103,10010104".trim().split(",");

//将标签广播
List<String> labelColums = Arrays.asList(labels);
Broadcast<List<String>> broadcastColumnList = spark.sparkContext().broadcast(labelColums, ClassManifestFactory.classType(List.class));

从hive中读取源数据

//数据格式为上面表格
Dataset<Row> memberDataTable = spark.sql("select memberid,labelid from tmp_train.tmp_xxx where memberid is  not null ");

构建Dataset-encode

    // dataset 的encoder
    private  ExpressionEncoder<Row> getEncoder(List<String> labelColums) {
        StructType structTypemember = new StructType();
        structTypemember = structTypemember.add("memberid", DataTypes.IntegerType, true);
        for (String labelName : labelColums) {
            structTypemember = structTypemember.add(labelName, DataTypes.IntegerType, true);
        }
        ExpressionEncoder<Row> encoder = RowEncoder.apply(structTypemember);
        return encoder;
    }

map过滤

 //构建structtype
 		ExpressionEncoder<Row> encoder = getEncoder(labelColums);
        Dataset<Row> memberTemp = memberDataTable.map((MapFunction<Row, Row>) row -> {
        	//broadcast核心部分
            List<String> columnLabels = broadcastColumnList.value();
            List<Integer> result = new ArrayList<>();
            Integer memberId = row.<Integer>getAs("memberid");
            result.add(memberId);
            Boolean isAllZero = false;
            try {
                List<String> labelList = Arrays.asList(row.<String>getAs("labelid").split(","));
                for (String column : columnLabels) {
                    if (labelList.contains(column)) {
                        result.add(1);
                        isAllZero = true;
                    } else {
                        result.add(0);
                    }
                }
                //如果标签至少有一个存在才输出结果
                if (isAllZero) {
                    return RowFactory.create(result.toArray());
                }
            } catch (Exception e) {
            }
            //如果标签全是0,那么这一行memberid设置为null
            result.set(0, null);
            return RowFactory.create(result.toArray());
        }, encoder).filter(col("memberid").isNotNull());

		memberTemp.show(false);
        memberTemp.printSchema();

构建的data

+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|memberid |05060801|05060701|05060601|02050601|02050501|02050401|11020101|11020102|11020103|11020104|11010101|11010102|11010103|11010104|10010101|10010102|10010103|10010104|
+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|445630465|0       |1       |0       |0       |0       |1       |0       |0       |0       |1       |0       |0       |0       |0       |0       |0       |0       |0       |
|445646977|0       |1       |0       |0       |0       |1       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445666369|0       |0       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445683073|0       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445708609|0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |0       |0       |
|445724737|0       |0       |0       |0       |0       |1       |0       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445736449|0       |1       |0       |0       |0       |1       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445752385|0       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445766209|0       |0       |0       |0       |0       |1       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445786753|0       |0       |0       |0       |0       |1       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445805185|0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |0       |
|445824961|0       |0       |0       |0       |0       |1       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445846849|0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |0       |0       |
|445861441|0       |0       |0       |0       |0       |1       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445879489|0       |0       |0       |0       |0       |1       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445894849|0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |0       |0       |
|445910593|0       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445925377|0       |0       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
|445940161|0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |0       |0       |
|445959169|0       |0       |0       |0       |0       |1       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |0       |
+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
  • 0
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值