Raw data and schema
+-------------+----------------+-----------+---------------------+----------------+---------+
|start_city_id|transfer_city_id|end_city_id|transfer_traffic_type|end_traffic_type|total_num|
+-------------+----------------+-----------+---------------------+----------------+---------+
|            1|              18|         36|                    T|               T|        7|
|            1|              18|         59|                    T|               T|       33|
|            1|              18|         67|                    T|               T|       81|
|            1|              18|         73|                    T|               T|        3|
|            1|              18|         74|                    T|               T|        6|
|            1|              18|         80|                    T|               T|        7|
|            1|              18|         82|                    T|               T|        6|
|            1|              18|        104|                    T|               T|        1|
|            1|              18|        117|                    T|               T|        1|
|            1|              18|        121|                    T|               T|        2|
|            1|              18|        132|                    T|               T|       11|
|            1|              18|        141|                    T|               T|       23|
|            1|              18|        143|                    T|               T|        6|
|            1|              18|        146|                    T|               T|        1|
|            1|              18|        175|                    T|               T|       35|
|            1|              18|        178|                    T|               T|        7|
|            1|              18|        179|                    T|               T|      119|
|            1|              18|        180|                    T|               T|       66|
|            1|              18|        183|                    T|               T|        2|
|            1|              18|        185|                    T|               T|       15|
+-------------+----------------+-----------+---------------------+----------------+---------+
root
|-- start_city_id: integer (nullable = true)
|-- transfer_city_id: integer (nullable = true)
|-- end_city_id: integer (nullable = true)
|-- transfer_traffic_type: string (nullable = true)
|-- end_traffic_type: string (nullable = true)
|-- total_num: integer (nullable = true)
Field descriptions: start_city_id is the departure city id, transfer_city_id is the transfer city id, end_city_id is the arrival city id, transfer_traffic_type is the traffic type of the first leg, end_traffic_type is the traffic type of the second leg, and total_num is the number of completed orders on that route.
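For reference, here is a minimal sketch of building a tableData Dataset with this schema for local experiments. The original does not show how the table is loaded, so the use of createDataFrame, the sample rows, and the variable names are assumptions (requires java.util.Arrays/List, org.apache.spark.sql.RowFactory, org.apache.spark.sql.types.StructType and DataTypes).
// Assumed local-test setup, not shown in the original; sample values taken from the table above.
List<Row> rows = Arrays.asList(
        RowFactory.create(1, 18, 36, "T", "T", 7),
        RowFactory.create(1, 18, 59, "T", "T", 33));
StructType schema = new StructType()
        .add("start_city_id", DataTypes.IntegerType)
        .add("transfer_city_id", DataTypes.IntegerType)
        .add("end_city_id", DataTypes.IntegerType)
        .add("transfer_traffic_type", DataTypes.StringType)
        .add("end_traffic_type", DataTypes.StringType)
        .add("total_num", DataTypes.IntegerType);
Dataset<Row> tableData = spark.createDataFrame(rows, schema);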
Now, for each combination of start_city_id, end_city_id, transfer_traffic_type and end_traffic_type, we want the transfer cities ranked by order count.
Dataset<Row> result = tableData
        .groupBy("start_city_id", "end_city_id", "transfer_traffic_type", "end_traffic_type")
        .agg(functions.collect_set(functions.struct("transfer_city_id", "total_num")).as("transfer_set"));
result.show(false);
The aggregated data looks like this:
+-------------+-----------+---------------------+----------------+------------------------------------------------------+
|start_city_id|end_city_id|transfer_traffic_type|end_traffic_type|transfer_set |
+-------------+-----------+---------------------+----------------+------------------------------------------------------+
|1            |49         |T                    |T               |[[36,9]]                                              |
|1            |105        |T                    |T               |[[36,25]]                                             |
|1            |178        |T                    |T               |[[18,7]]                                              |
|1            |179        |T                    |T               |[[192,3], [18,119], [175,10]]                         |
|1            |188        |T                    |T               |[[175,51], [18,36], [192,2], [1462,1]]                |
|1            |202        |T                    |T               |[[59,2], [74,1]]                                      |
|1            |216        |T                    |T               |[[121,2], [142,2], [145,8], [36,133], [18,8], [244,8]]|
|1            |252        |T                    |T               |[[101,12], [120,1], [82,2]]                           |
|1            |265        |T                    |T               |[[36,1], [1502,18], [1568,20]]                        |
|1            |280        |T                    |T               |[[216,31], [36,2]]                                    |
|1            |298        |T                    |T               |[[330,4], [314,2], [284,1]]                           |
|1            |299        |T                    |T               |[[203,1], [216,38], [36,3], [227,3]]                  |
|1            |367        |T                    |T               |[[3070,2]]                                            |
|1            |376        |T                    |T               |[[384,9], [426,2], [371,1]]                           |
|1            |384        |T                    |T               |[[518,6], [463,8], [59,24], [574,1]]                  |
|1            |396        |T                    |T               |[[357,4], [347,1]]                                    |
|1            |397        |T                    |T               |[[347,9]]                                             |
|1            |407        |T                    |T               |[[667,4]]                                             |
|1            |415        |T                    |T               |[[667,13], [648,1]]                                   |
|1            |429        |T                    |T               |[[426,9]]                                             |
+-------------+-----------+---------------------+----------------+------------------------------------------------------+
root
|-- start_city_id: integer (nullable = true)
|-- end_city_id: integer (nullable = true)
|-- transfer_traffic_type: string (nullable = true)
|-- end_traffic_type: string (nullable = true)
|-- transfer_set: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- transfer_city_id: integer (nullable = true)
| | |-- total_num: integer (nullable = true)
Notice that [[36,1], [1502,18], [1568,20]] is not sorted by total_num in descending order (collect_set gives no ordering guarantee).
So we sort it ourselves with a custom UDF: define RowSortUDF, which implements UDF1.
import org.apache.spark.sql.Row;
import org.apache.spark.sql.api.java.UDF1;
import scala.collection.JavaConversions;
import scala.collection.mutable.WrappedArray;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/**
 * Sorts an array<struct> column with the given comparator and serializes it
 * to a string such as "18:119,175:10,192:3".
 */
public class RowSortUDF implements UDF1<WrappedArray<Row>, String> {

    private Comparator<Row> comparator;

    public RowSortUDF(Comparator<Row> comparator) {
        this.comparator = comparator;
    }

    @Override
    public String call(WrappedArray<Row> rowWrappedArray) throws Exception {
        // Spark passes the array column to a Java UDF as a Scala WrappedArray of Row;
        // copy it into a mutable Java list so it can be sorted.
        List<Row> valueList = new ArrayList<Row>(JavaConversions.<Row>seqAsJavaList(rowWrappedArray));
        valueList.sort(comparator);
        // Render each struct as "transfer_city_id:total_num", then join the entries with commas.
        List<String> strList = new ArrayList<String>();
        for (Row row : valueList) {
            strList.add(row.mkString(":"));
        }
        return String.join(",", strList);
    }
}
import org.apache.spark.sql.Row;

import java.io.Serializable;
import java.util.Comparator;

/**
 * Compares two rows by the given integer field, in descending order
 * (so that, for example, 122:2 sorts before 12:1).
 */
public class ComparatorRow implements Comparator<Row>, Serializable {

    private final String sortName;

    public ComparatorRow(String fieldName) {
        this.sortName = fieldName;
    }

    @Override
    public int compare(Row o1, Row o2) {
        Integer value1 = o1.<Integer>getAs(sortName);
        Integer value2 = o2.<Integer>getAs(sortName);
        if (value1.equals(value2)) {
            return 0;
        } else if (value1 > value2) {
            return -1;
        } else {
            return 1;
        }
    }
}
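A quick, hypothetical sanity check of the comparator. Row.getAs(String) only works when the row carries a schema; inside the UDF Spark supplies one for the transfer_set structs, and in a standalone test it can be mimicked with GenericRowWithSchema (org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema, a catalyst class often used in tests). This check is not part of the original code.
// Hypothetical check with values from the [[36,1], [1502,18], [1568,20]] example.
StructType elementSchema = new StructType()
        .add("transfer_city_id", DataTypes.IntegerType)
        .add("total_num", DataTypes.IntegerType);
Row r1 = new GenericRowWithSchema(new Object[]{1502, 18}, elementSchema);
Row r2 = new GenericRowWithSchema(new Object[]{1568, 20}, elementSchema);
// Negative result: r2 (total_num = 20) sorts ahead of r1 (total_num = 18).
System.out.println(new ComparatorRow("total_num").compare(r2, r1));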
Registering and calling the UDF in the main program:
// Requires: import static org.apache.spark.sql.functions.*;
// plus org.apache.spark.sql.types.DataTypes and scala.collection.mutable.WrappedArray.
UDF1<WrappedArray<Row>, String> sortSet = new RowSortUDF(new ComparatorRow("total_num"));
spark.udf().register("sort_set", sortSet, DataTypes.StringType);
Dataset<Row> saveResult = result.select(
        col("start_city_id"), col("end_city_id"), col("transfer_traffic_type"), col("end_traffic_type"),
        callUDF("sort_set", col("transfer_set")).as("transfer_city"));
saveResult.show(false);
Result: each group is now sorted by total_num in descending order, and the transfer_set array has been flattened into a string:
+-------------+-----------+---------------------+----------------+-----------------------------------+
|start_city_id|end_city_id|transfer_traffic_type|end_traffic_type|transfer_city |
+-------------+-----------+---------------------+----------------+-----------------------------------+
|1            |49         |T                    |T               |36:9                               |
|1            |105        |T                    |T               |36:25                              |
|1            |178        |T                    |T               |18:7                               |
|1            |179        |T                    |T               |18:119,175:10,192:3                |
|1            |188        |T                    |T               |175:51,18:36,192:2,1462:1          |
|1            |202        |T                    |T               |59:2,74:1                          |
|1            |216        |T                    |T               |36:133,145:8,18:8,244:8,121:2,142:2|
|1            |252        |T                    |T               |101:12,82:2,120:1                  |
|1            |265        |T                    |T               |1568:20,1502:18,36:1               |
|1            |280        |T                    |T               |216:31,36:2                        |
|1            |298        |T                    |T               |330:4,314:2,284:1                  |
|1            |299        |T                    |T               |216:38,36:3,227:3,203:1            |
|1            |367        |T                    |T               |3070:2                             |
|1            |376        |T                    |T               |384:9,426:2,371:1                  |
|1            |384        |T                    |T               |59:24,463:8,518:6,574:1            |
|1            |396        |T                    |T               |357:4,347:1                        |
|1            |397        |T                    |T               |347:9                              |
|1            |407        |T                    |T               |667:4                              |
|1            |415        |T                    |T               |667:13,648:1                       |
|1            |429        |T                    |T               |426:9                              |
+-------------+-----------+---------------------+----------------+-----------------------------------+
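As a side note, on Spark 2.4+ a similar result can probably be achieved without a custom UDF: place total_num first in the struct so that sort_array orders by it, then rebuild the "city:num" string with the transform higher-order function. This is only a sketch under those assumptions, and tie-breaking on equal total_num may differ from the UDF version.
// Built-in alternative (assumes Spark 2.4+ for transform); not the approach used above.
Dataset<Row> alt = tableData
        .groupBy("start_city_id", "end_city_id", "transfer_traffic_type", "end_traffic_type")
        .agg(functions.sort_array(
                functions.collect_list(functions.struct("total_num", "transfer_city_id")), false)
                .as("sorted_set"))
        .withColumn("transfer_city", functions.expr(
                "concat_ws(',', transform(sorted_set, "
                        + "x -> concat(cast(x.transfer_city_id as string), ':', cast(x.total_num as string))))"))
        .drop("sorted_set");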