// --- Spark SQL (Java API) snippet collection; illustrative fragments, not a compilable unit ---
// NOTE(review): snippets are independent examples — e.g. the selectExpr snippet below
// references rankData, which is only defined in the later format_number snippet.
// Window + row_number usage: rank rows within each (start, end) station pair
// by line_num descending, with nulls sorted last; the raw line_num column is dropped.
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
WindowSpec w = Window.partitionBy("start_station_code", "end_station_code").orderBy(col("line_num").desc_nulls_last());
Dataset<Row> startToEndRank = aggData.withColumn("rank_num", row_number().over(w)).drop("line_num");
// struct + collect_set usage: per (start, end) pair, collect the distinct
// transfer tuples (as structs) into a single array column.
Dataset<Row> middleStationSet = startToEndRank.groupBy("start_station_code", "end_station_code").agg(collect_set(struct("transfer_arrive_station_code", "transfer_city_id","transfer_leave_station_code", "rank_num")).as("transfer_code_set"));
// selectExpr usage: SQL-style projection with inline casts.
Dataset<Row> trainRank = rankData.selectExpr("traffic_type", "from_station_code", "to_station_code", "cast(percent as double)", "cast(rank as int)").repartition(1);
// format_number usage: compute each row's share of the per-traffic_type total
// as a percentage, formatted to 8 decimal places (note: result is a string column).
Dataset<Row> rankData = searchCount.withColumn("percent", format_number(col("search_num").divide(sum("search_num").over(Window.partitionBy("traffic_type"))).multiply(100), 8))
// Join on identically-named columns: passing a Seq<String> of column names
// avoids duplicated join columns in the output.
Seq<String> seq = ColumnUtil.columnNames("start_city_id,transfer_city_id,end_city_id,first_traffic_type,second_traffic_type");
Dataset<Row> finalResult = avgCityTime.join(minCityTime, seq, "outer");
// Join on differently-named columns: use explicit Column equality expressions,
// then disambiguate same-named columns (city_name, station_name) via dataset-qualified refs.
Dataset<Row> startStationMap = spark.sql("SELECT * FROM base_tcdctrafficwisdomwukongbase.city_station_map where traffic_type='T'").repartition(1);
Dataset<Row> endStationMap = spark.sql("SELECT * FROM base_tcdctrafficwisdomwukongbase.city_station_map where traffic_type='T'").repartition(1);
Dataset<Row> trainWithCityName = trainData.join(startStationMap, trainData.col("from_station_code").equalTo(startStationMap.col("station_code"))).join(endStationMap, trainData.col("to_station_code").equalTo(endStationMap.col("station_code")))
.select(startStationMap.col("city_name").as("from_city_name"), endStationMap.col("city_name").as("to_city_name"), trainData.col("traffic_type"),
startStationMap.col("station_name").as("from_station_name"), endStationMap.col("station_name").as("to_station_name"), trainData.col("from_station_code"), trainData.col("to_station_code"), trainData.col("station_num")).repartition(1);
// SQL expression conversions (raw Spark SQL fragments, not Java):
// convert a millisecond-precision epoch string (first 10 chars = seconds) to a formatted timestamp.
from_unixtime(cast(substring(timestamp,1,10) as bigint),'yyyy-MM-dd HH:mm:ss')
// map a boolean column to a 0/1 flag.
case when hasResult =true then 1 else 0 end as request_true
Note — the ColumnUtil helper class used above:
import org.apache.spark.sql.Dataset;
import scala.Option;
import scala.collection.JavaConversions;
import scala.collection.Seq;
import java.util.Arrays;
import java.util.List;
/**
 * Utility helpers for bridging Java collections to Scala {@code Seq<String>},
 * as required by {@code Dataset.join(Dataset, Seq<String>, String)}, plus a
 * column-existence check for datasets.
 *
 * @author allen
 * Created by allen on 25/07/2017.
 */
public final class ColumnUtil {

    /** Utility class — not instantiable. */
    private ColumnUtil() {
    }

    /**
     * Converts a Java {@code List<String>} of column names into an optional Scala
     * {@code Seq<String>} for use with {@code Dataset.join}.
     *
     * <p>Bug fix: the previous implementation cast the Java list directly to a Scala
     * {@code Seq} ({@code (Seq<String>) listName}), which throws a
     * {@code ClassCastException} at runtime for every non-null input — a
     * {@code java.util.List} is never a {@code scala.collection.Seq}. The list is
     * now converted with {@code JavaConversions.asScalaBuffer}, matching the
     * String-based overload below.
     *
     * @param listName column names; may be {@code null}
     * @return the converted sequence wrapped in {@code Option}; {@code Option.empty()} when {@code listName} is {@code null}
     */
    public static Option<Seq<String>> columnNames(List<String> listName) {
        if (listName == null) {
            // preserves the original null behavior: Option.apply(null) == None
            return Option.empty();
        }
        Seq<String> converted = JavaConversions.asScalaBuffer(listName);
        return Option.apply(converted);
    }

    /**
     * Splits a comma-separated string of column names into a Scala
     * {@code Seq<String>} for use with {@code Dataset.join}.
     *
     * @param columnsName comma-separated column names, e.g. {@code "a,b,c"}; must not be {@code null}
     * @return the column names as a Scala sequence
     * @throws NullPointerException if {@code columnsName} is {@code null}
     */
    public static Seq<String> columnNames(String columnsName) {
        List<String> list = Arrays.asList(columnsName.split(","));
        return JavaConversions.asScalaBuffer(list);
    }

    /**
     * Checks whether the given dataset contains a column with the given name.
     *
     * @param dataset the dataset to inspect (any row type)
     * @param colName the column name to look for (exact, case-sensitive match)
     * @return {@code true} if the dataset has a column named {@code colName}
     */
    public static boolean hasColumn(Dataset<?> dataset, String colName) {
        return Arrays.asList(dataset.columns()).contains(colName);
    }
}