面对几 GB 大小的数据,直接处理并不容易:一方面加载和处理耗时较长,另一方面还很可能导致电脑死机或 spark-shell 崩溃。因此可以先从中取出一小部分数据用于调试。
有两个方法可以选择:randomSplit 和 filter
方法一:randomSplit(按权重随机切分,这里取权重为 0.001、约占 0.1% 的那一份;由于是随机抽样,得到的行数只是近似值)
scala> val t1=sc.textFile("/media/wangtuntun/DOWNLOAD/AAS_Trip/trip_data_1.csv")
t1: org.apache.spark.rdd.RDD[String] = /media/wangtuntun/DOWNLOAD/AAS_Trip/trip_data_1.csv MapPartitionsRDD[11] at textFile at <console>:27
scala> val t2=t1.randomSplit(Array(0.001,0.999))
t2: Array[org.apache.spark.rdd.RDD[String]] = Array(MapPartitionsRDD[14] at randomSplit at <console>:29, MapPartitionsRDD[15] at randomSplit at <console>:29)
scala> val tLess=t2(0)
tLess: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[14] at randomSplit at <console>:29
scala> tLess.cache
res5: tLess.type = MapPartitionsRDD[14] at randomSplit at <console>:29
scala> tLess.count
res6: Long = 14762
方法二:filter(按条件筛选,这里只保留第 9 个字段小于 100 的行;注意若文件含表头行或该字段不是整数,toInt 会在触发 action 时抛出 NumberFormatException)
scala> val split=t1.map(_.split(","))
split: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[18] at map at <console>:29
scala> val filter=split.filter( arr=>arr(8).toInt <100 )
filter: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[19] at filter at <console>:31
有两个方法可以选择:randomSplit 和 filter
方法一:randomSplit(按权重随机切分,这里取权重为 0.001、约占 0.1% 的那一份;由于是随机抽样,得到的行数只是近似值)
scala> val t1=sc.textFile("/media/wangtuntun/DOWNLOAD/AAS_Trip/trip_data_1.csv")
t1: org.apache.spark.rdd.RDD[String] = /media/wangtuntun/DOWNLOAD/AAS_Trip/trip_data_1.csv MapPartitionsRDD[11] at textFile at <console>:27
scala> val t2=t1.randomSplit(Array(0.001,0.999))
t2: Array[org.apache.spark.rdd.RDD[String]] = Array(MapPartitionsRDD[14] at randomSplit at <console>:29, MapPartitionsRDD[15] at randomSplit at <console>:29)
scala> val tLess=t2(0)
tLess: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[14] at randomSplit at <console>:29
scala> tLess.cache
res5: tLess.type = MapPartitionsRDD[14] at randomSplit at <console>:29
scala> tLess.count
res6: Long = 14762
方法二:filter(按条件筛选,这里只保留第 9 个字段小于 100 的行;注意若文件含表头行或该字段不是整数,toInt 会在触发 action 时抛出 NumberFormatException)
scala> val split=t1.map(_.split(","))
split: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[18] at map at <console>:29
scala> val filter=split.filter( arr=>arr(8).toInt <100 )
filter: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[19] at filter at <console>:31