目录
1、加载CSV数据源并移除首行
D盘的test/t目录下有文件users.csv,文件预览如下,加载数据到Spark并移除首行。
1.1 使用SparkContext
import org.apache.spark.{
SparkConf, SparkContext}
object CsvDemo {
  /**
   * Loads a CSV file into Spark as an RDD of lines, strips the header row,
   * and splits each remaining line into fields. Demonstrates two approaches.
   */
  def main(args: Array[String]): Unit = {
    // Local Spark context with 2 worker threads.
    val conf = new SparkConf().setMaster("local[2]").setAppName("CsvDemo")
    val sc = SparkContext.getOrCreate(conf)

    // Each RDD element is one raw text line of the CSV (header included).
    val lines = sc.textFile("D:\\test\\t\\users.csv")
    println("lines:" + lines.count())

    // Approach 1: drop the first element of partition 0 only — textFile puts
    // the file's first line (the header) at the start of partition 0.
    val fields = lines.mapPartitionsWithIndex((i, v) => {
      if (i == 0)
        v.drop(1)
      else
        v
    }).map(x => x.split(","))
    println("fields:" + fields.count())

    // Approach 2 (preferred): filter out the header by its leading column name.
    // NOTE(review): this would also drop any DATA row starting with "user_id".
    val fields2 = lines.filter(x => !x.startsWith("user_id")).map(x => x.split(","))
    // BUG FIX: the original printed fields.count() here instead of
    // fields2.count(); the mistake was masked because both counts are equal.
    println("fields2:" + fields2.count())

    // Release Spark resources before the JVM exits.
    sc.stop()
  }
}
/*输出:
lines:38210
fields:38209
fields2:38209
*/
1.2 使用SparkSession
import org.apache.spark.sql.SparkSession
import org.apache.spark.{
SparkConf, SparkContext, sql}
object CsvDemo {
def main(args: Array[String]):