一:读取csv文件
1.SparkContext读取
// Build a local SparkContext and read the CSV file as raw text lines.
// glom() gathers each partition into an Array, so this prints one array
// reference per partition (useful to see how the file was partitioned).
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("readcsv")
val sc: SparkContext = SparkContext.getOrCreate(conf)
val lines: RDD[String] = sc.textFile("in/users.csv")
lines.glom().foreach(println)
去除首行字段
(1)过滤器过滤掉
// Drop the CSV header by filtering out the line that begins with the
// first column name, then split every remaining line on commas.
// Fixes two compile errors in the original:
//   - `startWith` is not a String method; it is `startsWith`
//   - `user_id` was an unquoted, undefined identifier; it must be the
//     string literal "user_id" (the header's first column name)
val lines1: RDD[Array[String]] = lines.filter(x => !x.startsWith("user_id")).map(x => x.split(","))
lines1.glom().foreach(x => println(x.toList))
(2)mapPartitionsWithIndex
// Alternative header removal: the header line lives in partition 0, so
// drop the first element of that partition only and pass every other
// partition's iterator through untouched.
val lines2: RDD[String] = lines.mapPartitionsWithIndex { (partIdx, iter) =>
  if (partIdx == 0) iter.drop(1) else iter
}
println(lines2.count())
2.SparkSession读取
// Read the CSV through SparkSession's DataFrame reader; header=true
// makes Spark treat the first row as column names instead of data.
// Fix: SparkSession.Builder has no `Master` method — it is `master`
// (lowercase); the original did not compile.
val spark: SparkSession = SparkSession.builder().master("local[*]").appName("readcsv").getOrCreate()
val df: DataFrame = spark.read.format("csv")
.option("header",true).load("in/users.csv")
df.show()
二:读取json文件
请参考文章:http://t.csdnimg.cn/Xd8xx
三:将RDD转换成DataFrame/DataSet
首先有一个txt文件
1 zhangsan 25
2 lisi 27
3 wangwu 36
将文件转换成数组
// Build the SparkSession and load the space-separated people file.
// import spark.implicits._ brings in the encoders/$ syntax used by the
// toDF/toDS calls in the snippets below.
val spark: SparkSession = SparkSession.builder().master("local[*]").appName("sparksqldemo").getOrCreate()
val sc: SparkContext = spark.sparkContext
import spark.implicits._
val peopleRDD: RDD[String] = sc.textFile("in/people.txt")
// Split each "id name age" line into a (String, String, Int) tuple.
// NOTE(review): assumes every line has exactly three space-separated
// fields and a numeric third field — malformed lines would throw.
val people2RDD: RDD[(String, String, Int)] = peopleRDD.map(x => {
val strings: Array[String] = x.split(" ")
(strings(0), strings(1), strings(2).toInt)
})
people2RDD.collect().foreach(println)
第一种方式:
// Method 1: lift the tuple RDD into a typed Dataset (via the implicits
// imported above), then rename the tuple columns _1/_2/_3 with toDF.
val frame: DataFrame = people2RDD.toDS().toDF("id", "name", "age")
frame.show()
第二种方式:
// Method 2: describe the target table with an explicit StructType and
// pair it with an RDD[Row] whose values already match that schema.
val schema: StructType = StructType(
  Seq(
    StructField("id", IntegerType),
    StructField("name", StringType),
    StructField("age", IntegerType)
  )
)
// Each Row must line up with the schema: Int, String, Int.
val rowRDD: RDD[Row] = peopleRDD.map { line =>
  val cols = line.split(" ")
  Row(cols(0).toInt, cols(1), cols(2).toInt)
}
val frame1: DataFrame = spark.createDataFrame(rowRDD, schema)
frame1.show()
展示年龄大于30的姓名
// Four equivalent ways to select the names of people older than 30:
// 1) Column obtained via DataFrame.apply
frame1.filter(frame1("age")>30).select("name").show()
// 2) SQL expression string
frame1.filter("age>30").select("name").show()
// 3) $-interpolator column (needs import spark.implicits._)
frame1.filter($"age">30).select("name").show()
// 4) col() function (needs import org.apache.spark.sql.functions._)
frame1.filter(col("age")>30).select("name").show()
--上面 $"age" 写法需要 import spark.implicits._，col("age") 写法需要在顶部 import org.apache.spark.sql.functions._
// Register the DataFrame as a temporary view and run the same
// age-filter as a SQL query.
frame1.createOrReplaceTempView("people")
spark.sql("select name from people where age > 30").show()
四:创建dataSet的方式
1.
val ds: Dataset[Int] = spark.createDataset(1 to 10)
2.
// A Dataset of tuples gets the default column names _1/_2/_3; rename
// them all at once with toDF instead of chaining withColumnRenamed.
val ds2: Dataset[(String, Int, String)] = spark.createDataset(List(("zs",5,"男"),("lisi",6,"女")))
val ds2b: DataFrame = ds2.toDF("name", "age", "gender")
3.
// A Dataset can also be created from an existing RDD of tuples
// (makeRDD is an alias for parallelize).
val rdd: RDD[(String, Int, Int, Int)] = sc.makeRDD(List(("liuwei",23,170,190),("xupeng",24,180,170)))
val ds3: Dataset[(String, Int, Int, Int)] = spark.createDataset(rdd)
4.样例类:
定义两个case类
// Point: a labelled 2-D coordinate; Category: an (id, name) pair.
// Being case classes, both can be turned into Datasets with toDS()
// below (encoders come from the spark.implicits._ import).
case class Point(label:String,x:Double,y:Double)
case class Category(id:Long,name:String)
然后调用
// Build two small in-memory Datasets from the case classes above.
val points: Seq[Point] = Seq(
  Point("nanjing", 23.1, 47.2),
  Point("beijing", 23.1, 120.3),
  Point("xian", 80.2, 60.2)
)
val pointsDS: Dataset[Point] = points.toDS()
val categories: Seq[Category] = Seq(Category(1, "beijing"), Category(2, "nanjing"))
val categoryDS: Dataset[Category] = categories.toDS()
此时可以将两表关联
// Inner join (the default join type): keep the point/category pairs
// whose point label equals the category name — here "beijing" and
// "nanjing" match; "xian" has no category and is dropped.
val joindf: DataFrame = pointsDS.join(categoryDS,pointsDS("label")===categoryDS("name"))
joindf.show()