Spark/Java: Dataframe String column to Struct

One way you could do that is:

1. Normalize your time using Spark's static functions.
2. Check if your value is in the range using a UDF (user-defined function).

Using static functions:

df = df
    .withColumn(
        "date",
        date_format(col("time"), "yyyy-MM-dd HH:mm:ss.SSSS"))
    .withColumn("h", hour(col("date")))
    .withColumn("m", minute(col("date")))
    .withColumn("s", second(col("date")))
    .withColumn("event", expr("h*3600 + m*60 + s"))
    .drop("date")
    .drop("h")
    .drop("m")
    .drop("s");

If your dataframe looks like this before:

+---+-------------------+-----------------------+
|id |time               |range                  |
+---+-------------------+-----------------------+
|id1|2019-03-11 05:00:00|00h00-07h30;23h30-23h59|
|id2|2019-03-11 09:00:00|00h00-07h30;23h30-23h59|
|id3|2019-03-11 10:30:00|00h00-07h30;23h30-23h59|
+---+-------------------+-----------------------+

Afterwards, it should look like:

+---+-------------------+-----------------------+-----+
|id |time               |range                  |event|
+---+-------------------+-----------------------+-----+
|id1|2019-03-11 05:00:00|00h00-07h30;23h30-23h59|18000|
|id2|2019-03-11 09:00:00|00h00-07h30;23h30-23h59|32400|
|id3|2019-03-11 10:30:00|00h00-07h30;23h30-23h59|37800|
+---+-------------------+-----------------------+-----+
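
The event column is just the time of day expressed in seconds since midnight: 05:00:00 gives 5*3600 = 18000, 09:00:00 gives 9*3600 = 32400, and 10:30:00 gives 10*3600 + 30*60 = 37800.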

Using a UDF:

df = df.withColumn("between",
    callUDF("inRange", col("range"), col("event")));

and the result will be:

+---+-------------------+-----------------------+-----+-------+
|id |time               |range                  |event|between|
+---+-------------------+-----------------------+-----+-------+
|id1|2019-03-11 05:00:00|00h00-07h30;23h30-23h59|18000|true   |
|id2|2019-03-11 09:00:00|00h00-07h30;23h30-23h59|32400|false  |
|id3|2019-03-11 10:30:00|00h00-07h30;23h30-23h59|37800|false  |
+---+-------------------+-----------------------+-----+-------+
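
This is the expected result: 18000 seconds (05:00) falls inside the 00h00-07h30 window, while 32400 (09:00) and 37800 (10:30) sit between 07h30 and 23h30, i.e. outside both windows.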

The InRangeUdf

Your UDF would look like:

package net.jgp.books.sparkInAction.ch14.lab900_in_range;

import org.apache.spark.sql.api.java.UDF2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class InRangeUdf implements UDF2<String, Integer, Boolean> {
  private static Logger log = LoggerFactory
      .getLogger(InRangeUdf.class);
  private static final long serialVersionUID = -21621751L;

  @Override
  public Boolean call(String range, Integer event) throws Exception {
    log.debug("-> call({}, {})", range, event);
    String[] ranges = range.split(";");
    for (int i = 0; i < ranges.length; i++) {
      log.debug("Processing range #{}: {}", i, ranges[i]);
      String[] hours = ranges[i].split("-");
      // "HHhMM" -> seconds since midnight
      int start =
          Integer.valueOf(hours[0].substring(0, 2)) * 3600 +
          Integer.valueOf(hours[0].substring(3)) * 60;
      int end =
          Integer.valueOf(hours[1].substring(0, 2)) * 3600 +
          Integer.valueOf(hours[1].substring(3)) * 60;
      log.debug("Checking between {} and {}", start, end);
      if (event >= start && event <= end) {
        return true;
      }
    }
    return false;
  }
}
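
If you want to sanity-check the parsing logic outside Spark, you can call the UDF directly on the sample values from the tables above. A minimal sketch; the InRangeUdfCheck class name is just for illustration:

public class InRangeUdfCheck {
  public static void main(String[] args) throws Exception {
    InRangeUdf udf = new InRangeUdf();
    // 05:00:00 -> 18000 s, inside the 00h00-07h30 window
    System.out.println(udf.call("00h00-07h30;23h30-23h59", 18000)); // true
    // 09:00:00 -> 32400 s, outside both windows
    System.out.println(udf.call("00h00-07h30;23h30-23h59", 32400)); // false
  }
}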

Driver code

Your driver code will look like:

package net.jgp.books.sparkInAction.ch14.lab900_in_range;

import static org.apache.spark.sql.functions.*;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * Custom UDF to check if in range.
 *
 * @author jgp
 */
public class InCustomRangeApp {

  /**
   * main() is your entry point to the application.
   *
   * @param args
   */
  public static void main(String[] args) {
    InCustomRangeApp app = new InCustomRangeApp();
    app.start();
  }

  /**
   * The processing code.
   */
  private void start() {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("Custom UDF to check if in range")
        .master("local[*]")
        .getOrCreate();

    // Registers the UDF under the name used by callUDF()
    spark.udf().register(
        "inRange",
        new InRangeUdf(),
        DataTypes.BooleanType);

    Dataset<Row> df = createDataframe(spark);
    df.show(false);

    // Turns the time of day into seconds since midnight (the "event" column)
    df = df
        .withColumn(
            "date",
            date_format(col("time"), "yyyy-MM-dd HH:mm:ss.SSSS"))
        .withColumn("h", hour(col("date")))
        .withColumn("m", minute(col("date")))
        .withColumn("s", second(col("date")))
        .withColumn("event", expr("h*3600 + m*60 + s"))
        .drop("date")
        .drop("h")
        .drop("m")
        .drop("s");
    df.show(false);

    // Checks whether the event falls in one of the ranges
    df = df.withColumn("between",
        callUDF("inRange", col("range"), col("event")));
    df.show(false);
  }

  private static Dataset<Row> createDataframe(SparkSession spark) {
    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField(
            "id",
            DataTypes.StringType,
            false),
        DataTypes.createStructField(
            "time",
            DataTypes.StringType,
            false),
        DataTypes.createStructField(
            "range",
            DataTypes.StringType,
            false) });

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create("id1", "2019-03-11 05:00:00",
        "00h00-07h30;23h30-23h59"));
    rows.add(RowFactory.create("id2", "2019-03-11 09:00:00",
        "00h00-07h30;23h30-23h59"));
    rows.add(RowFactory.create("id3", "2019-03-11 10:30:00",
        "00h00-07h30;23h30-23h59"));

    return spark.createDataFrame(rows, schema);
  }
}
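
As a variant, Spark's Java API also lets you register the same logic inline as a lambda instead of a separate class. A sketch, assuming org.apache.spark.sql.api.java.UDF2 is imported in the driver and the body mirrors InRangeUdf.call():

spark.udf().register(
    "inRange",
    (UDF2<String, Integer, Boolean>) (range, event) -> {
      for (String r : range.split(";")) {
        String[] hours = r.split("-");
        int start = Integer.valueOf(hours[0].substring(0, 2)) * 3600
            + Integer.valueOf(hours[0].substring(3)) * 60;
        int end = Integer.valueOf(hours[1].substring(0, 2)) * 3600
            + Integer.valueOf(hours[1].substring(3)) * 60;
        if (event >= start && event <= end) {
          return true;
        }
      }
      return false;
    },
    DataTypes.BooleanType);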
