Common Spark Functions (Java)

//Using row_number over a window
import static org.apache.spark.sql.functions.*;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
WindowSpec w = Window.partitionBy("start_station_code", "end_station_code").orderBy(col("line_num").desc_nulls_last());
Dataset<Row> startToEndRank = aggData.withColumn("rank_num", row_number().over(w)).drop("line_num");
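
A follow-up sketch (assuming the aggData pipeline above): keep only the top-ranked row per station pair by filtering on the computed rank_num.

//hypothetical continuation: most frequent line per (start, end) pair
Dataset<Row> topLine = startToEndRank.filter(col("rank_num").equalTo(1));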

//Using struct and collect_set
Dataset<Row> middleStationSet = startToEndRank.groupBy("start_station_code", "end_station_code")
	.agg(collect_set(struct("transfer_arrive_station_code", "transfer_city_id", "transfer_leave_station_code", "rank_num")).as("transfer_code_set"));
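
To unpack the collected structs later, explode() turns each element of transfer_code_set back into its own row, and struct fields are then reachable with dot notation. A sketch based on the schema above:

Dataset<Row> exploded = middleStationSet
	.withColumn("transfer", explode(col("transfer_code_set")))
	.select(col("start_station_code"), col("end_station_code"),
		col("transfer.transfer_city_id"), col("transfer.rank_num"));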

//selectExpr
Dataset<Row> trainRank = rankData.selectExpr("traffic_type", "from_station_code", "to_station_code", "cast(percent as double)", "cast(rank as int)").repartition(1);


//format_number: formatting a computed percentage
Dataset<Row> rankData = searchCount.withColumn("percent",
	format_number(col("search_num").divide(sum("search_num").over(Window.partitionBy("traffic_type"))).multiply(100), 8));
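
Note that format_number returns a StringType column, which is why the selectExpr snippet above casts percent back to double; the same cast can also be done directly, as in this sketch:

Dataset<Row> numericPercent = rankData.withColumn("percent", col("percent").cast("double"));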

//Join on columns with identical names
Seq<String> seq = ColumnUtil.columnNames("start_city_id,transfer_city_id,end_city_id,first_traffic_type,second_traffic_type");
Dataset<Row> finalResult = avgCityTime.join(minCityTime, seq, "outer");

//Join on columns with different names
Dataset<Row> startStationMap = spark.sql("SELECT * FROM base_tcdctrafficwisdomwukongbase.city_station_map where traffic_type='T'").repartition(1);
Dataset<Row> endStationMap = spark.sql("SELECT * FROM base_tcdctrafficwisdomwukongbase.city_station_map where traffic_type='T'").repartition(1);
Dataset<Row> trainWithCityName = trainData.join(startStationMap, trainData.col("from_station_code").equalTo(startStationMap.col("station_code"))).join(endStationMap, trainData.col("to_station_code").equalTo(endStationMap.col("station_code")))
	.select(startStationMap.col("city_name").as("from_city_name"), endStationMap.col("city_name").as("to_city_name"), trainData.col("traffic_type"),
	startStationMap.col("station_name").as("from_station_name"), endStationMap.col("station_name").as("to_station_name"), trainData.col("from_station_code"), trainData.col("to_station_code"), trainData.col("station_num")).repartition(1);
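
Since both sides of this join come from the same mapping table, an alternative sketch uses a single Dataset with alias() to disambiguate the two roles (same table and columns as above):

Dataset<Row> stationMap = spark.sql("SELECT * FROM base_tcdctrafficwisdomwukongbase.city_station_map where traffic_type='T'");
Dataset<Row> joined = trainData
	.join(stationMap.alias("s"), trainData.col("from_station_code").equalTo(col("s.station_code")))
	.join(stationMap.alias("e"), trainData.col("to_station_code").equalTo(col("e.station_code")))
	.select(col("s.city_name").as("from_city_name"), col("e.city_name").as("to_city_name"));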

//SQL expression conversions
from_unixtime(cast(substring(timestamp,1,10) as bigint),'yyyy-MM-dd HH:mm:ss')
case when hasResult = true then 1 else 0 end as request_true
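
Both expressions can be applied through selectExpr; a sketch assuming a logs Dataset with timestamp and hasResult columns:

Dataset<Row> converted = logs.selectExpr(
	"from_unixtime(cast(substring(timestamp,1,10) as bigint),'yyyy-MM-dd HH:mm:ss') as event_time",
	"case when hasResult = true then 1 else 0 end as request_true");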

Note: the ColumnUtil class referenced above:

import org.apache.spark.sql.Dataset;
import scala.Option;
import scala.collection.JavaConversions;
import scala.collection.Seq;

import java.util.Arrays;
import java.util.List;

/**
 * @author allen
 * Created by allen on 25/07/2017.
 */
public final class ColumnUtil {

	/**
	 * Convert a List<String> to a Scala Seq<String>, for Dataset joins
	 * @param listName
	 * @return
	 */
	public static Option<Seq<String>> columnNames(List<String> listName){
		// A direct Java cast to Seq throws ClassCastException at runtime;
		// convert through JavaConversions instead.
		Seq<String> b = JavaConversions.asScalaBuffer(listName);
		return Option.apply(b);
	}

	/**
	 * Split a comma-separated String into a Scala Seq<String>, for Dataset joins
	 * @param columnsName
	 * @return
	 */
	public static Seq<String> columnNames(String columnsName){
		List<String> list= Arrays.asList(columnsName.split(","));
		return JavaConversions.asScalaBuffer(list);
	}

	/**
	 * Check whether a Dataset contains a given column
	 * @param dataset
	 * @param colName
	 * @return
	 */
	public static boolean hasColumn(Dataset<?> dataset, String colName){
		return Arrays.asList(dataset.columns()).contains(colName);
	}
}
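
A quick usage sketch for ColumnUtil (left and right are hypothetical Datasets sharing the listed key columns):

Seq<String> keys = ColumnUtil.columnNames("start_city_id,end_city_id");
Dataset<Row> joined = left.join(right, keys, "outer");
if (ColumnUtil.hasColumn(joined, "start_city_id")) {
	joined.show();
}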
