方法一:Spark RDD 算子 —— 利用 filter 算子过滤掉 CSV 的表头行
// Read the raw CSV file as an RDD of text lines.
val fileRdd = sc.textFile("/app/data/exam/meituan_waimai_meishi.csv")
// Drop the header row (the line starting with "spu_id").
// FIX: the filter statement was glued onto the comment line above, so the
// `val spuRDD = ...` definition was commented out and never executed; it also
// referenced `fileRDD` (wrong case — the val above is `fileRdd`), which would
// not compile. `!x.startsWith(...)` replaces the unidiomatic `== false`.
val spuRDD = fileRdd.filter(x => !x.startsWith("spu_id"))
方法二:Spark SQL —— 读表时通过 header 选项将首行识别为表头,直接去除
//写法1
// Approach 1: use the generic DataFrameReader. With header=true the first
// line is consumed as column names and never appears in the data; with
// inferSchema=true Spark samples the file to pick column types.
// NOTE(review): "端口" in the HDFS URI is a placeholder (host:port) — fill in
// the real NameNode address before running.
val spuDF = spark.read
  .option("header", true)
  .option("inferSchema", true)
  .format("csv")
  .load("hdfs://端口/exam/exam1/meituan_waimai_meishi.csv")
//写法2
// Approach 2: the csv() shorthand reader. header="true" makes Spark treat
// the first line as column names, so the header row is excluded up front.
// The result is cached because it will be reused by later queries.
// NOTE(review): despite its name, `rdd` is a DataFrame, not an RDD.
val headerAwareReader = spark.read.option("header", "true")
val rdd = headerAwareReader
  .csv("hdfs://192.168.226.131:9000/app/data/exam/meituan_waimai_meishi.csv")
  .cache()