Spark Dataset 的一些 api 操作_dataset<row>-CSDN博客

本文链接：https://blog.csdn.net/m0_37739193/article/details/132662326

文章目录

一、给 RDD 添加新列

方式一：使用 selectExpr

// Projection: keep rowKey, derive id / teacher_id / time_stamp from the
// '_'-separated parts of rowKey, add a constant student_id, then every source column.
// NOTE(review): "rowKey" is listed explicitly and is also covered by "*", so it
// looks like the result carries that column twice - confirm this is intended.
String[] projection = {
        "rowKey",
        "split(rowKey, '_')[0] as id",
        "split(rowKey, '_')[2] as teacher_id",
        "substr(split(rowKey, '_')[1], 1, 10) as time_stamp",
        "'fwefjalwefkew' as student_id",
        "*"
};
Dataset<Row> rowDataset = sourceData.selectExpr(projection);

方式二：将已有的列扩展为新列

// Parse the "Time" column (pattern yyyy-MM-dd HH:mm:ss) into epoch seconds and
// store it as "k_ts". withColumn already names the column, so no SQL alias is needed.
Dataset<Row> hehe = haha.withColumn("k_ts",
        functions.unix_timestamp(functions.col("Time"), "yyyy-MM-dd HH:mm:ss"));

// 添加一个新的列，计算两个现有列的和
df.withColumn("sum", functions.col("column1").plus(functions.col("column2")))

// 添加一个新的列
df.withColumn("column1", functions.col("column3"));

// 添加一个新的列，使用正则和替换等操作
Dataset<Row> hehe = haha.withColumn("city", functions.regexp_replace(functions.translate(haha.col("p"), "<>", ""),  "[[^\\w]+]", "_"))

// 添加一个新的列
Dataset<Row> hehe = haha.withColumn("itemName", functions.lit("city"))

// Add a new column through a UDF.
// Define a UDF that reverses a string; null cells stay null instead of
// raising a NullPointerException inside the task.
UDF1<String, String> reverseUDF = new UDF1<String, String>() {
    @Override
    public String call(String s) throws Exception {
        if (s == null) {
            return null;
        }
        return new StringBuilder(s).reverse().toString();
    }
};

// Register the UDF under the name "reverse" with a string return type.
spark.udf().register("reverse", reverseUDF, DataTypes.StringType);

// Apply the UDF to column1; withColumn returns a new Dataset, so keep the result.
Dataset<Row> withReversed = df.withColumn("reversedColumn", functions.callUDF("reverse", functions.col("column1")));

实际运用场景1：字符串转换为数组类型

参考：hive处理字符串化数组

-- 要转换的字段数据样例（注：这个字段是 string 类型）
      array       |
------------------|
[3.32, 3.62, 3.59]|

// Turn the stringified array (e.g. "[3.32, 3.62, 3.59]") into a real array column:
// strip the leading '[' and trailing ']', then split on ','.
// Using the typed Column functions avoids pasting Column.toString() into SQL text,
// so the regex needs no embedded quotes and no extra level of backslash escaping.
// Elements keep any whitespace that followed the comma.
Dataset<Row> hehe = haha.withColumn("array_tmp",
        functions.split(functions.regexp_replace(haha.col("array"), "^\\[|\\]$", ""), ","));

-- 拆列的话
// Expand array_tmp into one float column per element.
// Array indexes are 0-based while the column names are 1-based (vol1, vol2, ...).
for (int i = 0; i < num; i++) {
    dataset2 = dataset2.withColumn("vol" + (i + 1), functions.expr("array_tmp[" + i + "]").cast(DataTypes.FloatType));
}

-- 最后的效果
vol1|vol2|vol3|
----|----|----|
3.32|3.62|3.59|

坑1：本来想用 cast 直接转换为 array 类型，cast(array_tmp as array<float>)，结果报错 org.apache.spark.sql.AnalysisException: Can't extract value from array_tmp#22547: need struct type but got string;，我在 Starrocks 里这么干是可以的。

坑2：一开始写成这样不行

-- 这个并没有将 [ 和 ] 替换掉
// Pitfall: with only two backslashes per bracket, '[' and ']' are not stripped.
// NOTE(review): presumably the generated SQL string literal is unescaped once more
// by the SQL parser, so the regex loses its bracket escapes - confirm.
haha.withColumn("array_tmp", functions.expr("split(" + functions.regexp_replace(haha.col("array"), "'^\\[|\\]$'", "''") + ", ',')"));

// Pitfall: without the embedded single quotes, the pattern and the replacement end up
// unquoted in the SQL text built by the concatenation, and parsing fails
// (see the error output that follows).
haha.withColumn("array_tmp", functions.expr("split(" + functions.regexp_replace(haha.col("array"), "^\\[|\\]$", "") + ", ',')"));
-- 这个报错：
== SQL ==
split(regexp_replace(array_tmp, ^\[|\]$, ), ',')
--------------------^^^

方式三：使用 Java API 和 JavaRDD 在 Spark SQL 中向数据帧添加新列

在应用 mapPartitions 函数后，用扩展后的 schema 创建一个新的 DataFrame：

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Demonstrates adding columns to a {@code Dataset<Row>} by mapping its partitions
 * through a {@code JavaRDD} and rebuilding the DataFrame with an extended schema.
 */
public class Handler implements Serializable {

    /**
     * Filters and sorts {@code sourceData}, appends {@code startTime} and {@code endTime}
     * columns partition by partition, prints both schemas and shows up to 100 rows.
     *
     * @param sourceData input rows; the code reads its {@code rowKey}, {@code name}
     *                   and {@code time} columns
     */
    public void handler(Dataset<Row> sourceData) {
        Dataset<Row> rowDataset = sourceData
                .where("rowKey = 'abcdefg_123'")
                .selectExpr("split(rowKey, '_')[0] as id",
                        "name",
                        "time")
                .where("name = '小强'")
                .orderBy(functions.col("id").asc(), functions.col("time").desc());

        FlatMapFunction<Iterator<Row>, Row> mapPartitionsToTime = rows -> {
            // Counts rows inside the current partition only; it is not a global sequence.
            int count = 0;
            String startTime = "";
            String endTime = "";
            List<Row> mappedRows = new ArrayList<>();
            while (rows.hasNext()) {
                count++;
                Row next = rows.next();
                if (count == 2) {
                    // The second row of the partition supplies the time that this row
                    // and all later rows carry; the first row keeps the empty defaults.
                    startTime = next.getAs("time");
                    endTime = next.getAs("time");
                }
                // Value order must follow the schema: id, name, time, startTime, endTime.
                // NOTE(review): getString assumes all three source columns are strings - confirm.
                mappedRows.add(RowFactory.create(
                        next.getString(0), next.getString(1), next.getString(2), startTime, endTime));
            }
            return mappedRows.iterator();
        };

        JavaRDD<Row> sensorDataDoubleRDD = rowDataset.toJavaRDD().mapPartitions(mapPartitionsToTime);

        // Extend the existing schema with the two non-nullable string columns.
        StructType oldSchema = rowDataset.schema();
        StructType newSchema = oldSchema.add("startTime", DataTypes.StringType, false)
                .add("endTime", DataTypes.StringType, false);

        System.out.println("The new schema is: ");
        newSchema.printTreeString();

        System.out.println("The old schema is: ");
        oldSchema.printTreeString();

        // Reuse the session that owns the input Dataset instead of an undefined 'spark' variable.
        Dataset<Row> sensorDataDoubleDF =
                sourceData.sparkSession().createDataFrame(sensorDataDoubleRDD, newSchema);
        sensorDataDoubleDF.show(100, false);
    }
}

打印结果：

The new schema is: 
root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- time: string (nullable = true)
 |-- startTime: string (nullable = false)
 |-- endTime: string (nullable = false)

The old schema is: 
root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- time: string (nullable = true)

+-----------+---------+----------+----------+----------+
|id         |name     |time      |startTime |endTime   |
+-----------+---------+----------+----------+----------+
|abcdefg_123|xiaoqiang|1693462023|1693462023|1693462023|
|abcdefg_321|xiaoliu  |1693462028|1693462028|1693462028|
+-----------+---------+----------+----------+----------+

参考：
java - 使用 Java API 和 JavaRDD 在 Spark SQL 中向数据帧添加新列
 java.util.Arrays$ArrayList cannot be cast to java.util.Iterator

二、foreachPartition 遍历 Dataset

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.io.IOException;
import java.io.Serializable;
import java.util.Iterator;

/**
 * Demonstrates iterating over a {@code Dataset<Row>} partition by partition with
 * {@code JavaRDD.foreachPartition}, first as an anonymous class and then as a lambda.
 */
public class Handler implements Serializable {

    /**
     * Walks every row of {@code sourceData} and reads the {@code time} value of rows
     * whose {@code id} equals {@code "123"}.
     *
     * @param sourceData input rows; the code reads its {@code id} and {@code time} columns
     */
    public void handler(Dataset<Row> sourceData) {
        JavaRDD<Row> dataRDD = sourceData.toJavaRDD();

        // Anonymous-class form.
        dataRDD.foreachPartition(new VoidFunction<Iterator<Row>>() {
            @Override
            public void call(Iterator<Row> rowIterator) throws Exception {
                while (rowIterator.hasNext()) {
                    Row next = rowIterator.next();
                    String id = next.getAs("id");
                    // Constant-first equals keeps a null id from throwing.
                    if ("123".equals(id)) {
                        String startTime = next.getAs("time");
                        // Other business logic goes here.
                    }
                }
            }
        });

        // The same traversal written as a lambda expression.
        dataRDD.foreachPartition((VoidFunction<Iterator<Row>>) rowIterator -> {
            while (rowIterator.hasNext()) {
                Row next = rowIterator.next();
                String id = next.getAs("id");
                if ("123".equals(id)) {
                    String startTime = next.getAs("time");
                    // Other business logic goes here.
                }
            }
        });
    }
}

Hbase 批量删除操作：

    /**
     * Deletes from the HBase table {@code hehe_t} every row whose key appears in the
     * {@code rowKey} column of {@code dataDataset}, one batch per partition.
     *
     * <p>The delete list, the connection and the table are all created inside
     * {@code foreachPartition}, so nothing built outside the partition function is
     * shared with it apart from {@code configFile}.
     *
     * @param dataDataset rows providing a {@code rowKey} string column
     * @param configFile  configuration passed to {@code HBaseUtil.getHBaseConnect}
     */
    public static void deleteDataBatch(Dataset<Row> dataDataset, String configFile) {
        JavaRDD<Row> dataRDD = dataDataset.toJavaRDD();
        dataRDD.foreachPartition((VoidFunction<Iterator<Row>>) rowIterator -> {
            List<Delete> deletes = new ArrayList<>();
            while (rowIterator.hasNext()) {
                Row next = rowIterator.next();
                String rowKey = next.getAs("rowKey");
                deletes.add(new Delete(Bytes.toBytes(rowKey)));
            }
            // Nothing to delete in this partition: do not open a connection at all.
            if (deletes.isEmpty()) {
                return;
            }
            // try-with-resources closes the table and then the connection even when delete() throws.
            try (Connection connection = HBaseUtil.getHBaseConnect(configFile);
                 Table table = connection.getTable(TableName.valueOf("hehe_t"))) {
                table.delete(deletes);
            }
        });
    }

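注意：收集 Delete 列表以及创建/使用 HBase 连接都要放在 foreachPartition 里面，不要放在外面，否则会失败（foreachPartition 里的代码在 executor 上执行，操作的是闭包变量的副本，driver 端的 deletes 不会被填充）。错误案例：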

    /**
     * Counter-example: batch delete that keeps the delete list, the HBase connection
     * and the table outside of {@code foreachPartition}. Shown as the failing variant.
     *
     * NOTE(review): the partition function captures {@code deletes}; on a cluster it
     * presumably runs against a serialized copy, so the list used by
     * {@code table.delete} below stays empty - confirm against Spark's closure rules.
     */
    public static void deleteDataBatch(Dataset<Row> dataDataset, String configFile) {
   
        // Created where the method is called, not inside the partition function.
        List<Delete> deletes = new ArrayList<>();
        // Create the HBase connection
        Connection connection = getHBaseConnect(configFile);
        Properties properties = PropertiesUtil.getProperties(configFile);
        TableName name=TableName.valueOf(properties.getProperty("hbase.table.name"));
        Table table = null;
        try {
   
            table = connection.getTable(name);
        } catch (IOException e) {
   
            // The failure is only printed; table stays null and is dereferenced further down.
            e.printStackTrace();
        }

        JavaRDD<Row> dataRDD = dataDataset.toJavaRDD();
        dataRDD.foreachPartition((VoidFunction<Iterator<Row>>) rowIterator -> {
   
            while (rowIterator.hasNext()) {
   
                Row next = rowIterator.next();
                String rowKey = next.getAs("rowKey");
                // Adds to the captured list from inside the partition function.
                deletes.add(new Delete(Bytes.toBytes(rowKey)));
            }
        });

        try {
   
            // Uses the list declared at the top of the method; connection is never closed here.
            table.delete(deletes);
            table.close();
            System.out.println("数据删除成功！");
        } catch (IOException e) {
   
            e.printStackTrace();
            System.out.println("数据删除失败功！");
        }
    }

扩展：工作中观察到的总结：

System.out.println 是否会打印在控制台：
SparkStreaming：cluster 模式不会打印（但在 Yarn 日志中可以看到打印信息），client 和 local 模式都可以打印。注：client 模式也是部分会打印部分不会打印。
离线非实时程序：cluster 模式不会打印（但在 Yarn 日志中可以看到打印信息），client 和 local 模式都可以打印。注：但如果在 foreachPartition 这个方法中写打印代码的话，client 模式不会在控制台打印但是 local 模式却能，很神奇。
进程是否会在 Yarn 应用程序页面显示：
SparkStreaming：cluster 会，client 和 local 模式不会。
离线非实时程序：client 和 cluster 都会，local 模式不会。
命令行 --name 参数和代码中的 setAppName 对比：
SparkStreaming：cluster 模式 --name 优先级高，setAppName 好像没起作用默认展示类名，因为我代码里写的是 .setAppName("heheApp") 但是 Yarn 页面上展示的是 com.xiaoqiang.hehe。
离线非实时程序：client 模式 setAppName 优先级高，cluster 模式 --name 优先级高。
代码 spark 配置项中加入 .master("local[*]") 程序能否启动成功：
SparkStreaming：cluster 模式不会成功，client 能成功。
离线非实时程序：cluster 和 client 模式都不会成功。报错：

24/01/24 10:54:01 ERROR SparkContext: Error initializing SparkContext.
org.apache.spark.SparkException: A master URL must be set in your configuration
        at org.apache.spark.SparkContext.<init>(SparkContext.scala:368)
        at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2520)
        at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:935)
        at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:926)
        at scala.Option.getOrElse(Option.scala:121)
        at