Spark/Java: Dataframe String column to Struct

One way you could do that is:

1. Normalize your time using Spark's static functions.
2. Check if your value is in the range using a UDF (user-defined function).

Using static functions:

df = df
    .withColumn(
        "date",
        date_format(col("time"), "yyyy-MM-dd HH:mm:ss.SSSS"))
    .withColumn("h", hour(col("date")))
    .withColumn("m", minute(col("date")))
    .withColumn("s", second(col("date")))
    .withColumn("event", expr("h*3600 + m*60 + s"))
    .drop("date")
    .drop("h")
    .drop("m")
    .drop("s");

If your dataframe looks like this before:

+---+-------------------+-----------------------+
|id |time               |range                  |
+---+-------------------+-----------------------+
|id1|2019-03-11 05:00:00|00h00-07h30;23h30-23h59|
|id2|2019-03-11 09:00:00|00h00-07h30;23h30-23h59|
|id3|2019-03-11 10:30:00|00h00-07h30;23h30-23h59|
+---+-------------------+-----------------------+

Afterwards, it should look like:

+---+-------------------+-----------------------+-----+
|id |time               |range                  |event|
+---+-------------------+-----------------------+-----+
|id1|2019-03-11 05:00:00|00h00-07h30;23h30-23h59|18000|
|id2|2019-03-11 09:00:00|00h00-07h30;23h30-23h59|32400|
|id3|2019-03-11 10:30:00|00h00-07h30;23h30-23h59|37800|
+---+-------------------+-----------------------+-----+
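
The event column is just the time of day expressed in seconds since midnight: 05:00:00 gives 5*3600 = 18000, 09:00:00 gives 9*3600 = 32400, and 10:30:00 gives 10*3600 + 30*60 = 37800.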

Using a UDF:

df = df.withColumn("between",
    callUDF("inRange", col("range"), col("event")));

and the result will be:

+---+-------------------+-----------------------+-----+-------+
|id |time               |range                  |event|between|
+---+-------------------+-----------------------+-----+-------+
|id1|2019-03-11 05:00:00|00h00-07h30;23h30-23h59|18000|true   |
|id2|2019-03-11 09:00:00|00h00-07h30;23h30-23h59|32400|false  |
|id3|2019-03-11 10:30:00|00h00-07h30;23h30-23h59|37800|false  |
+---+-------------------+-----------------------+-----+-------+
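
This is the expected result: 18000 seconds (05:00) falls inside the 00h00-07h30 window, while 32400 (09:00) and 37800 (10:30) sit between 07h30 and 23h30, i.e. outside both windows.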

The InRangeUdf

Your UDF would look like:

package net.jgp.books.sparkInAction.ch14.lab900_in_range;

import org.apache.spark.sql.api.java.UDF2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class InRangeUdf implements UDF2<String, Integer, Boolean> {
  private static Logger log = LoggerFactory
      .getLogger(InRangeUdf.class);
  private static final long serialVersionUID = -21621751L;

  @Override
  public Boolean call(String range, Integer event) throws Exception {
    log.debug("-> call({}, {})", range, event);
    String[] ranges = range.split(";");
    for (int i = 0; i < ranges.length; i++) {
      log.debug("Processing range #{}: {}", i, ranges[i]);
      String[] hours = ranges[i].split("-");
      // "HHhMM" -> seconds since midnight
      int start =
          Integer.valueOf(hours[0].substring(0, 2)) * 3600 +
          Integer.valueOf(hours[0].substring(3)) * 60;
      int end =
          Integer.valueOf(hours[1].substring(0, 2)) * 3600 +
          Integer.valueOf(hours[1].substring(3)) * 60;
      log.debug("Checking between {} and {}", start, end);
      if (event >= start && event <= end) {
        return true;
      }
    }
    return false;
  }
}
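
If you want to sanity-check the parsing logic outside Spark, you can call the UDF directly on the sample values from the tables above. A minimal sketch; the InRangeUdfCheck class name is just for illustration:

public class InRangeUdfCheck {
  public static void main(String[] args) throws Exception {
    InRangeUdf udf = new InRangeUdf();
    // 05:00:00 -> 18000 s, inside the 00h00-07h30 window
    System.out.println(udf.call("00h00-07h30;23h30-23h59", 18000)); // true
    // 09:00:00 -> 32400 s, outside both windows
    System.out.println(udf.call("00h00-07h30;23h30-23h59", 32400)); // false
  }
}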

Driver code

Your driver code will look like:

package net.jgp.books.sparkInAction.ch14.lab900_in_range;

import static org.apache.spark.sql.functions.*;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * Custom UDF to check if in range.
 *
 * @author jgp
 */
public class InCustomRangeApp {

  /**
   * main() is your entry point to the application.
   *
   * @param args
   */
  public static void main(String[] args) {
    InCustomRangeApp app = new InCustomRangeApp();
    app.start();
  }

  /**
   * The processing code.
   */
  private void start() {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("Custom UDF to check if in range")
        .master("local[*]")
        .getOrCreate();

    // Registers the UDF under the name used by callUDF()
    spark.udf().register(
        "inRange",
        new InRangeUdf(),
        DataTypes.BooleanType);

    Dataset<Row> df = createDataframe(spark);
    df.show(false);

    // Turns the time of day into seconds since midnight (the "event" column)
    df = df
        .withColumn(
            "date",
            date_format(col("time"), "yyyy-MM-dd HH:mm:ss.SSSS"))
        .withColumn("h", hour(col("date")))
        .withColumn("m", minute(col("date")))
        .withColumn("s", second(col("date")))
        .withColumn("event", expr("h*3600 + m*60 + s"))
        .drop("date")
        .drop("h")
        .drop("m")
        .drop("s");
    df.show(false);

    // Checks whether the event falls in one of the ranges
    df = df.withColumn("between",
        callUDF("inRange", col("range"), col("event")));
    df.show(false);
  }

  private static Dataset<Row> createDataframe(SparkSession spark) {
    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField(
            "id",
            DataTypes.StringType,
            false),
        DataTypes.createStructField(
            "time",
            DataTypes.StringType,
            false),
        DataTypes.createStructField(
            "range",
            DataTypes.StringType,
            false) });

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create("id1", "2019-03-11 05:00:00",
        "00h00-07h30;23h30-23h59"));
    rows.add(RowFactory.create("id2", "2019-03-11 09:00:00",
        "00h00-07h30;23h30-23h59"));
    rows.add(RowFactory.create("id3", "2019-03-11 10:30:00",
        "00h00-07h30;23h30-23h59"));

    return spark.createDataFrame(rows, schema);
  }
}
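
As a variant, Spark's Java API also lets you register the same logic inline as a lambda instead of a separate class. A sketch, assuming org.apache.spark.sql.api.java.UDF2 is imported in the driver and the body mirrors InRangeUdf.call():

spark.udf().register(
    "inRange",
    (UDF2<String, Integer, Boolean>) (range, event) -> {
      for (String r : range.split(";")) {
        String[] hours = r.split("-");
        int start = Integer.valueOf(hours[0].substring(0, 2)) * 3600
            + Integer.valueOf(hours[0].substring(3)) * 60;
        int end = Integer.valueOf(hours[1].substring(0, 2)) * 3600
            + Integer.valueOf(hours[1].substring(3)) * 60;
        if (event >= start && event <= end) {
          return true;
        }
      }
      return false;
    },
    DataTypes.BooleanType);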
