Code:
[WBQ@westgisB068 ~]$ spark-shell
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/18 10:26:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark context Web UI available at http://westgisB068:4040
Spark context available as 'sc' (master = local[*], app id = local-1681784767038).
Spark session available as 'spark'.
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 3.3.2
/_/
Using Scala version 2.12.15 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_271)
Type in expressions to have them evaluated.
Type :help for more information.
scala> import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SparkSession
scala> val dfraw = spark.read.format("csv").option("header","true").option("encoding","utf-8").load("file:///home/WBQ/soft/data/traffic/part-*")
dfraw: org.apache.spark.sql.DataFrame = [卡号: string, 交易时间: string ... 2 more fields]
scala> dfraw.show(5)
+-------+--------------------+--------+--------+
| 卡号| 交易时间|线路站点|交易类型|
+-------+--------------------+--------+--------+
|3697647|2018-10-01T18:47:...| 大新|地铁出站|
|3697647|2018-10-01T18:35:...|宝安中心|地铁入站|
|3697647|2018-10-01T13:49:...| 大新|地铁入站|
|3697647|2018-10-01T14:03:...|宝安中心|地铁出站|
|5344820|2018-10-17T09:34:...| 罗湖|地铁入站|
+-------+--------------------+--------+--------+
only showing top 5 rows
scala> val schemas = Seq("cardid","captime","rawstation","trans_type")
schemas: Seq[String] = List(cardid, captime, rawstation, trans_type)
scala> val df01 = dfraw.toDF(schemas:_*)
df01: org.apache.spark.sql.DataFrame = [cardid: string, captime: string ... 2 more fields]
scala> df01.show(5)
+-------+--------------------+----------+----------+
| cardid| captime|rawstation|trans_type|
+-------+--------------------+----------+----------+
|3697647|2018-10-01T18:47:...| 大新| 地铁出站|
|3697647|2018-10-01T18:35:...| 宝安中心| 地铁入站|
|3697647|2018-10-01T13:49:...| 大新| 地铁入站|
|3697647|2018-10-01T14:03:...| 宝安中心| 地铁出站|
|5344820|2018-10-17T09:34:...| 罗湖| 地铁入站|
+-------+--------------------+----------+----------+
only showing top 5 rows
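As an aside, the English column names can also be supplied at load time with an explicit schema; when a user-defined schema is combined with header=true, Spark still skips the header row but takes the column names from the schema. A minimal sketch (dfNamed is an illustrative name; all four columns stay strings):

  import org.apache.spark.sql.types.{StructType, StructField, StringType}

  // Build a four-column, all-string schema with the English names used above
  val schema = StructType(Seq("cardid", "captime", "rawstation", "trans_type")
    .map(name => StructField(name, StringType, nullable = true)))

  val dfNamed = spark.read.format("csv")
    .option("header", "true")      // header row is skipped; schema names are used
    .option("encoding", "utf-8")
    .schema(schema)
    .load("file:///home/WBQ/soft/data/traffic/part-*")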
scala> val df02 = df01.filter(col("trans_type").contains("地铁入站")||col("trans_type").contains("地铁出站"))
df02: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [cardid: string, captime: string ... 2 more fields]
scala> df02.show(5)
+-------+--------------------+----------+----------+
| cardid| captime|rawstation|trans_type|
+-------+--------------------+----------+----------+
|3697647|2018-10-01T18:47:...| 大新| 地铁出站|
|3697647|2018-10-01T18:35:...| 宝安中心| 地铁入站|
|3697647|2018-10-01T13:49:...| 大新| 地铁入站|
|3697647|2018-10-01T14:03:...| 宝安中心| 地铁出站|
|5344820|2018-10-17T09:34:...| 罗湖| 地铁入站|
+-------+--------------------+----------+----------+
only showing top 5 rows
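Since trans_type holds the exact labels 地铁入站 and 地铁出站, the same filter can be written as a whitelist with isin instead of two substring tests; a sketch (df02b is an illustrative name):

  import org.apache.spark.sql.functions.col

  // Keep only the two metro transaction types (exact match)
  val df02b = df01.filter(col("trans_type").isin("地铁入站", "地铁出站"))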
scala> val df03 = df02.select("cardid","captime").where("rawstation is not null").dropDuplicates("cardid","captime")
df03: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [cardid: string, captime: string]
scala> val df03A = df03.where("captime like '2018-10-%'")
df03A: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [cardid: string, captime: string]
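The like '2018-10-%' predicate keeps only October 2018 records; an equivalent column-API form, as a sketch:

  import org.apache.spark.sql.functions.col

  // Same month filter without SQL syntax
  val df03B = df03.where(col("captime").startsWith("2018-10"))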
scala> def replaceStationName(station:String):String={
| var dststation = station
| if(!station.endsWith("站"))
| dststation = station+"站"
| else
| dststation = dststation
|
| if (dststation.equals("马鞍山站"))
| dststation = "马安山站"
| else if (dststation.equals("深圳大学站"))
| dststation = "深大站"
| else if (dststation.equals("?Ⅰ站"))
| dststation = "子岭站"
| else{}
| dststation
| }
replaceStationName: (station: String)String
scala> val addCol_replaceStation = udf(replaceStationName _)
addCol_replaceStation: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4066/783363478@28caba4b,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)
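The renaming rules in replaceStationName boil down to a suffix fix plus a lookup table, so the function can be written more compactly; a sketch with the same three aliases, including the garbled "?Ⅰ站" key exactly as it appears in the session (the V2 names are illustrative):

  import org.apache.spark.sql.functions.udf

  // Append "站" where missing, then apply the alias table
  val stationAliases = Map("马鞍山站" -> "马安山站", "深圳大学站" -> "深大站", "?Ⅰ站" -> "子岭站")
  def replaceStationNameV2(station: String): String = {
    val withSuffix = if (station.endsWith("站")) station else station + "站"
    stationAliases.getOrElse(withSuffix, withSuffix)
  }
  val addCol_replaceStationV2 = udf(replaceStationNameV2 _)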
scala> df03.show(5)
+-------+--------------------+
| cardid| captime|
+-------+--------------------+
|0000164|2018-10-06T20:53:...|
|0001185|2018-10-05T17:13:...|
|0002117|2018-10-19T07:51:...|
|0002495|2018-10-11T22:11:...|
|0003781|2018-10-19T18:45:...|
+-------+--------------------+
only showing top 5 rows
scala> val df03 = df02.select("cardid","captime","rawstation","trans_type").where("rawstation is not null").dropDuplicates("cardid","captime","rawstation","trans_type")
df03: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [cardid: string, captime: string ... 2 more fields]
scala> val df03A = df03.where("captime like '2018-10-%'")
df03A: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [cardid: string, captime: string ... 2 more fields]
scala> val df04 = df03.withColumn("station",addCol_replaceStation(df03("rawstation")))
df04: org.apache.spark.sql.DataFrame = [cardid: string, captime: string ... 3 more fields]
scala> def replacegjTime(captime:String):String =
| {
| val glString = captime
| val gjsdf = new SimpleDateFormat("yyyy-MM-ddTHH-mm-ss");
| val dstsdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
| val dt= gjsdf.prase(gjString)
| val longtime = dt.getTime()
| val dstString = dstsdf.format(longtime);
| dstString
| }
<console>:26: error: not found: type SimpleDateFormat
val gjsdf = new SimpleDateFormat("yyyy-MM-ddTHH-mm-ss");
^
<console>:27: error: not found: type SimpleDateFormat
val dstsdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
^
scala> import java.text.SimpleDateFormat
import java.text.SimpleDateFormat
scala> def replacegjTime(captime:String):String =
| {
| val glString = captime
| val gjsdf = new SimpleDateFormat("yyyy-MM-ddTHH-mm-ss");
| val dstsdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
| val dt= gjsdf.prase(gjString)
| val longtime = dt.getTime()
| val dstString = dstsdf.format(longtime);
| dstString
| }
<console>:29: error: value prase is not a member of java.text.SimpleDateFormat
val dt= gjsdf.prase(gjString)
^
<console>:29: error: not found: value gjString
val dt= gjsdf.prase(gjString)
^
scala> import java.time.*
<console>:24: error: object * is not a member of package java.time
import java.time.*
^
scala> def replacegjTime(captime:String):String =
| {
| val glString = captime
| val gjsdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
| val dstsdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
| val dt= gjsdf.parse(glString)
| val longtime = dt.getTime()
| val dstString = dstsdf.format(longtime);
| dstString
| }
replacegjTime: (captime: String)String
scala> val addCol_replacegjTime = udf(replacegjTime _)
addCol_replacegjTime: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4586/1958400275@f8958ec,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)
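A UDF is not strictly required for this conversion: Spark's built-in to_timestamp and date_format can reparse the ISO-style string directly, which also sidesteps the SimpleDateFormat quoting pitfalls seen above. A sketch producing the same time column (df05Alt is an illustrative name):

  import org.apache.spark.sql.functions.{col, to_timestamp, date_format}

  // Parse the ISO-style timestamp, then re-render it with a space separator
  val df05Alt = df04.withColumn(
    "time",
    date_format(to_timestamp(col("captime"), "yyyy-MM-dd'T'HH:mm:ss"),
                "yyyy-MM-dd HH:mm:ss"))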
scala> var df05R = df05.withColumn("time",addCol_replacegjTime(df04("captime"))).select("cardid","time","")
<console>:26: error: not found: value df05
var df05R = df05.withColumn("time",addCol_replacegjTime(df04("captime"))).select("cardid","time","")
^
scala> df04.show(5)
+-------+--------------------+----------+----------+----------+
| cardid| captime|rawstation|trans_type| station|
+-------+--------------------+----------+----------+----------+
|0000001|2018-10-31T19:58:...| 龙华| 地铁出站| 龙华站|
|0000003|2018-10-25T18:12:...| 西丽| 地铁入站| 西丽站|
|0000011|2018-10-09T19:31:...| 赤尾| 地铁出站| 赤尾站|
|0000011|2018-10-15T19:06:...| 赤尾| 地铁出站| 赤尾站|
|0000011|2018-10-16T18:17:...| 福田口岸| 地铁入站|福田口岸站|
+-------+--------------------+----------+----------+----------+
only showing top 5 rows
scala> var df05R = df04.withColumn("time",addCol_replacegjTime(df04("captime"))).select("cardid","captime","rawstation","trans_type","station")
df05R: org.apache.spark.sql.DataFrame = [cardid: string, captime: string ... 3 more fields]
scala> df05R.show(5)
+-------+--------------------+----------+----------+----------+
| cardid| captime|rawstation|trans_type| station|
+-------+--------------------+----------+----------+----------+
|0000001|2018-10-31T19:58:...| 龙华| 地铁出站| 龙华站|
|0000003|2018-10-25T18:12:...| 西丽| 地铁入站| 西丽站|
|0000011|2018-10-09T19:31:...| 赤尾| 地铁出站| 赤尾站|
|0000011|2018-10-15T19:06:...| 赤尾| 地铁出站| 赤尾站|
|0000011|2018-10-16T18:17:...| 福田口岸| 地铁入站|福田口岸站|
+-------+--------------------+----------+----------+----------+
only showing top 5 rows
scala> var df05R = df04.withColumn("time",addCol_replacegjTime(df04("captime"))).select("cardid","time","rawstation","trans_type","station")
df05R: org.apache.spark.sql.DataFrame = [cardid: string, time: string ... 3 more fields]
scala>
Generating trajectories and stations
[WBQ@westgisB068 ~]$ spark-shell
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/25 10:41:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark context Web UI available at http://westgisB068:4040
Spark context available as 'sc' (master = local[*], app id = local-1682390468167).
Spark session available as 'spark'.
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 3.3.2
/_/
Using Scala version 2.12.15 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_271)
Type in expressions to have them evaluated.
Type :help for more information.
scala> import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SparkSession
scala> val dfraw = spark.read.format("csv").option("header","true").option("encoding","utf-8").load("file:///home/WBQ/soft/data/traffic/part-*")
dfraw: org.apache.spark.sql.DataFrame = [卡号: string, 交易时间: string ... 2 more fields]
scala> dfraw.show(5)
+-------+--------------------+--------+--------+
| 卡号| 交易时间|线路站点|交易类型|
+-------+--------------------+--------+--------+
|3697647|2018-10-01T18:47:...| 大新|地铁出站|
|3697647|2018-10-01T18:35:...|宝安中心|地铁入站|
|3697647|2018-10-01T13:49:...| 大新|地铁入站|
|3697647|2018-10-01T14:03:...|宝安中心|地铁出站|
|5344820|2018-10-17T09:34:...| 罗湖|地铁入站|
+-------+--------------------+--------+--------+
only showing top 5 rows
scala> val schemes = Seq("cardid","captime","rawstation","trans_type")
schemes: Seq[String] = List(cardid, captime, rawstation, trans_type)
scala> val df01 = dfraw.toDF(schemes:_*)
df01: org.apache.spark.sql.DataFrame = [cardid: string, captime: string ... 2 more fields]
scala> import java.text.SimpleDateFormat
import java.text.SimpleDateFormat
scala> var separator = "<>"
separator: String = <>
scala> import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType};
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
scala> val df02 = df01.select(df01("cardid"),concat_ws(separator,df01("captime"),df01("rawstation")).cast(StringType).as("timelocation"))
df02: org.apache.spark.sql.DataFrame = [cardid: string, timelocation: string]
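The "<>" separator contains no regex metacharacters, so the combined field can be split back apart later if needed; a sketch (the column names added here are illustrative):

  import org.apache.spark.sql.functions.{col, split}

  // Recover the two halves of timelocation: element 0 is the time, element 1 the station
  val parts = df02.withColumn("time", split(col("timelocation"), "<>")(0))
                  .withColumn("station", split(col("timelocation"), "<>")(1))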
scala> def replaceTime(captime:String):String=
| {
| val gjString = captime
| val gjsdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
| val dstsdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
| val dt = gjsdf.parse(gjString)
| val longtime = dt.getTime()
| val dstString = dstsdf.format(longtime);
| dstString
| }
replaceTime: (captime: String)String
scala> val addCol_replacegjTime = udf(replaceTime _)
addCol_replacegjTime: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$3994/294314442@1a5fdcae,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)
scala> var df03 = df01.withColumn("time",addCol_replacegjTime(df01("captime"))).select("cardid","time","rawstation","trans_type")
df03: org.apache.spark.sql.DataFrame = [cardid: string, time: string ... 2 more fields]
scala> df03.show(5)
+-------+-------------------+----------+----------+
| cardid| time|rawstation|trans_type|
+-------+-------------------+----------+----------+
|3697647|2018-10-01 18:47:44| 大新| 地铁出站|
|3697647|2018-10-01 18:35:34| 宝安中心| 地铁入站|
|3697647|2018-10-01 13:49:27| 大新| 地铁入站|
|3697647|2018-10-01 14:03:52| 宝安中心| 地铁出站|
|5344820|2018-10-17 09:34:29| 罗湖| 地铁入站|
+-------+-------------------+----------+----------+
only showing top 5 rows
scala> val df02 = df03.filter(col("trans_type").contains("地铁入站")||col("trans_type").contains("地铁出站"))
df02: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [cardid: string, time: string ... 2 more fields]
scala> df02.show(5)
+-------+-------------------+----------+----------+
| cardid| time|rawstation|trans_type|
+-------+-------------------+----------+----------+
|3697647|2018-10-01 18:47:44| 大新| 地铁出站|
|3697647|2018-10-01 18:35:34| 宝安中心| 地铁入站|
|3697647|2018-10-01 13:49:27| 大新| 地铁入站|
|3697647|2018-10-01 14:03:52| 宝安中心| 地铁出站|
|5344820|2018-10-17 09:34:29| 罗湖| 地铁入站|
+-------+-------------------+----------+----------+
only showing top 5 rows
scala> val df04 = df02.select(df02("cardid"),concat_ws(separator,df02("time"),df02("rawstation").cast(StringType).as("timelocation")))
df04: org.apache.spark.sql.DataFrame = [cardid: string, concat_ws(<>, time, CAST(rawstation AS STRING) AS timelocation): string]
scala> df04.limit(5).collect.foreach(println)
[3697647,2018-10-01 18:47:44<>大新]
[3697647,2018-10-01 18:35:34<>宝安中心]
[3697647,2018-10-01 13:49:27<>大新]
[3697647,2018-10-01 14:03:52<>宝安中心]
[5344820,2018-10-17 09:34:29<>罗湖]
scala> val df04 = df02.select(df02("cardid"),df02("time"))
df04: org.apache.spark.sql.DataFrame = [cardid: string, time: string]
scala> df04.show(5)
+-------+-------------------+
| cardid| time|
+-------+-------------------+
|3697647|2018-10-01 18:47:44|
|3697647|2018-10-01 18:35:34|
|3697647|2018-10-01 13:49:27|
|3697647|2018-10-01 14:03:52|
|5344820|2018-10-17 09:34:29|
+-------+-------------------+
only showing top 5 rows
scala> df04.limit(5).collect.foreach(println)
[3697647,2018-10-01 18:47:44]
[3697647,2018-10-01 18:35:34]
[3697647,2018-10-01 13:49:27]
[3697647,2018-10-01 14:03:52]
[5344820,2018-10-17 09:34:29]
scala> df03.show(5)
+-------+-------------------+----------+----------+
| cardid| time|rawstation|trans_type|
+-------+-------------------+----------+----------+
|3697647|2018-10-01 18:47:44| 大新| 地铁出站|
|3697647|2018-10-01 18:35:34| 宝安中心| 地铁入站|
|3697647|2018-10-01 13:49:27| 大新| 地铁入站|
|3697647|2018-10-01 14:03:52| 宝安中心| 地铁出站|
|5344820|2018-10-17 09:34:29| 罗湖| 地铁入站|
+-------+-------------------+----------+----------+
only showing top 5 rows
scala> val df04 = df02.select(df02("cardid"),concat_ws(separator,df02("time"),df02("rawstation")).cast(StringType).as("timelocation"))
df04: org.apache.spark.sql.DataFrame = [cardid: string, timelocation: string]
scala> df04.show(5)
+-------+--------------------+
| cardid| timelocation|
+-------+--------------------+
|3697647|2018-10-01 18:47:...|
|3697647|2018-10-01 18:35:...|
|3697647|2018-10-01 13:49:...|
|3697647|2018-10-01 14:03:...|
|5344820|2018-10-17 09:34:...|
+-------+--------------------+
only showing top 5 rows
scala> val df05 = df04.groupBy("cardid").agg(collect_set("timelocation"))
df05: org.apache.spark.sql.DataFrame = [cardid: string, collect_set(timelocation): array<string>]
scala> df05.show(5)
+-------+-------------------------+
| cardid|collect_set(timelocation)|
+-------+-------------------------+
|0000029| [2018-10-14 21:36...|
|0000052| [2018-10-22 20:27...|
|0000088| [2018-10-12 18:55...|
|0000102| [2018-10-07 19:18...|
|0000120| [2018-10-20 20:59...|
+-------+-------------------------+
only showing top 5 rows
scala> df05.limit(5).collect.foreach(println)
[0000029,WrappedArray(2018-10-14 21:36:02<>市民中心, 2018-10-06 12:15:49<>后海, 2018-10-14 22:00:30<>黄贝岭, 2018-10-14 19:48:00<>黄贝岭, 2018-10-14 20:11:30<>市民中心)]
[0000052,WrappedArray(2018-10-22 20:27:37<>深圳北站, 2018-10-22 21:12:17<>湖贝)]
[0000088,WrappedArray(2018-10-12 18:55:24<>兴东, 2018-10-12 20:07:09<>翠竹)]
[0000102,WrappedArray(2018-10-07 19:18:53<>深圳北站, 2018-10-07 18:57:32<>西丽, 2018-10-07 13:35:33<>世界之窗, 2018-10-07 13:37:47<>世界之窗)]
[0000120,WrappedArray(2018-10-20 20:59:55<>福田口岸, 2018-10-01 16:49:54<>少年宫, 2018-10-01 21:33:55<>上沙, 2018-10-03 17:23:31<>深圳湾公园, 2018-10-17 19:17:00<>上沙, 2018-10-23 18:38:18<>上沙, 2018-10-01 16:05:17<>上沙, 2018-10-20 10:02:00<>上沙, 2018-10-23 18:57:56<>香梅北, 2018-10-04 06:41:55<>上沙, 2018-10-23 20:01:28<>上沙, 2018-10-03 17:01:36<>上沙, 2018-10-23 19:27:56<>香梅北, 2018-10-04 07:25:41<>深圳北站, 2018-10-01 21:20:01<>车公庙, 2018-10-17 18:39:49<>深圳北站, 2018-10-20 10:21:18<>福田口岸, 2018-10-20 21:20:09<>上沙)]
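One caveat: collect_set deduplicates but guarantees no ordering, so the trajectories above are in arbitrary order. For a chronological trajectory, sorting the collected array is enough here, because the "yyyy-MM-dd HH:mm:ss" prefix makes lexicographic order coincide with time order; a sketch (df05Ordered is an illustrative name):

  import org.apache.spark.sql.functions.{collect_list, sort_array}

  // Time-ordered trajectory per card; lexicographic == chronological
  // because every entry starts with "yyyy-MM-dd HH:mm:ss"
  val df05Ordered = df04.groupBy("cardid")
    .agg(sort_array(collect_list("timelocation")).as("trajectory"))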
scala> :quit
[WBQ@westgisB068 ~]$