Loading a CSV data source
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql._
import org.apache.spark.{SparkConf, SparkContext}
object ReadCsvDemo {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("ReadCsvDemo")
val sc:SparkContext=SparkContext.getOrCreate(conf)
// val rdd: RDD[String] = sc.textFile("in/users.csv")
// rdd.collect().foreach(println)
// println(rdd.count()) //38210
//
// val rdd2: RDD[String] = rdd.filter(x=>x.startsWith("user_id")==false)
// rdd2.collect().foreach(println)
// println(rdd2.count())
// val rdd3: RDD[Array[String]] = rdd2.map(x=>x.split(","))
// rdd3.collect().foreach(x=>println(x.toList))
//Drop the header line with mapPartitionsWithIndex
// val rdd1: RDD[String] = rdd.mapPartitionsWithIndex((index, value) => {
// if (index == 0)
// value.drop(1)
// else
// value
// })
// println(rdd1.count())
// println("tag: ",rdd1.filter(x => x.startsWith("user_id") == true).count())
val spark:SparkSession = SparkSession.builder().appName("ReadCsvSparkSession").master("local[*]").getOrCreate()
val df: DataFrame =spark.read.format("csv").option("header","true").load("in/users.csv")
df.printSchema()
df.show(numRows = 10)
val df2: DataFrame = df.select("user_id","birthyear")
df2.show()
df2.printSchema()
val column: Column = df2("birthyear")
val df3: DataFrame = df2.withColumn("birthyear", column.cast(IntegerType)) //cast birthyear to Int (withColumn replaces the existing column here)
println("------------------------------")
df3.printSchema()
df3.show()
val column1: Column = df3("birthyear") <= 1995 //a Column condition equivalent to the row-level filter below
// val ds: Dataset[Row] = df3.filter(column1)
val ds: Dataset[Row] = df3.filter(x => { !x.isNullAt(1) && x.getInt(1) <= 1995 }) //skip rows with a null birthyear before reading it as Int
println("------------------------------")
ds.printSchema()
// ds.show()
val df4: DataFrame = ds.withColumnRenamed("user_id", "userid")
df4.printSchema()
}
}
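As a side note, the header row and the birthyear cast can also be handled at read time instead of with separate withColumn calls. The snippet below is only a minimal sketch under the same assumptions as the demo above (the in/users.csv file and a local master); the ReadCsvSketch object and its names are made up for illustration.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.IntegerType

object ReadCsvSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ReadCsvSketch").master("local[*]").getOrCreate()
    val users = spark.read
      .option("header", "true")                                     //drop the header row while reading
      .csv("in/users.csv")
      .select("user_id", "birthyear")
      .withColumn("birthyear", col("birthyear").cast(IntegerType))  //same cast as in the demo above
      .filter(col("birthyear").isNotNull && col("birthyear") <= 1995)
      .withColumnRenamed("user_id", "userid")
    users.printSchema()
    users.show(10)
    spark.stop()
  }
}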
Loading a JSON data source
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DoubleType, IntegerType}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
object ReadJsonDemo {
def main(args: Array[String]): Unit = {
// val conf:SparkConf=new SparkConf().setMaster("local[*]").setAppName("cacheDemo")
// val sc:SparkContext=SparkContext.getOrCreate(conf)
val spark: SparkSession = SparkSession.builder().appName("ReadJsonSparkSession").master("local[*]").getOrCreate()
val sc: SparkContext = spark.sparkContext
//
// val rdd: RDD[String] = sc.textFile("in/users.json")
//
// import scala.util.parsing.json.JSON
//
// val rdd2: RDD[Option[Any]] = rdd.map(x => JSON.parseFull(x)) //parse each JSON line into an Option[Any]
//
// rdd.collect().foreach(println)
// rdd2.collect().foreach(println)
val df: DataFrame = spark.read.format("json").load("in/user.json") //the JSON reader has no header option; each line is parsed as one JSON record
df.printSchema()
df.show()
val frame: DataFrame = df.withColumn("ida", df("id").cast(IntegerType)).withColumnRenamed("name","uname")
frame.printSchema()
frame.show()
// val f2: DataFrame = frame.select("ida","uname","age")
// val frame3: DataFrame = f2.withColumn("ageinc",(f2("age")*2).cast(DoubleType)) //add a new ageinc column
// frame3.printSchema()
// frame3.show()
// import spark.implicits._
// val frame1: DataFrame = frame.select("ida", "uname", "age").withColumn("ageinc", ($"age" * 2).cast(DoubleType)) //the cast must apply to the Column, not the DataFrame
// frame1.printSchema()
// frame1.show()
}
}
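The same select-and-derive step can also go through a temporary view and Spark SQL. The following is a hedged sketch, assuming in/user.json is line-delimited JSON with id, name and age fields as in the demo above; ReadJsonSketch is only an illustrative name.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.IntegerType

object ReadJsonSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ReadJsonSketch").master("local[*]").getOrCreate()
    val df = spark.read.json("in/user.json")
    df.withColumn("ida", df("id").cast(IntegerType))
      .withColumnRenamed("name", "uname")
      .createOrReplaceTempView("users")                             //expose the DataFrame to SQL
    val result = spark.sql("select ida, uname, age, age * 2.0 as ageinc from users")
    result.printSchema()
    result.show()
    spark.stop()
  }
}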
Age statistics: the UserAge example
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object UserAge {
def main(args: Array[String]): Unit = {
val conf:SparkConf=new SparkConf().setMaster("local[*]").setAppName("sparkDemo")
val sc:SparkContext=SparkContext.getOrCreate(conf)
val userage: RDD[String] = sc.textFile("in/userage.txt")
// val age2: RDD[Int] = userage.map(x=>{
// val strings: Array[String] = x.split(" ")
// strings(1).toInt
// })
//
// val num:Long=age2.count()
//
// val sumAge:Int=age2.reduce(_+_)
//
// val result:Double=sumAge.toDouble/num
// println(result)
val rdd: RDD[(Int, Int)] = userage.map(x => {
(x.split(" ")(1).toInt, 1) //an (age, 1) pair for each line
})
val tuple: (Int, Int) = rdd.reduce((x, y)=>{(x._1+y._1,x._2+y._2)})
println("avgage :" + tuple._1.toDouble /tuple._2 )
val tuple2 = rdd.fold((0,0))((x,y)=>{(x._1+y._1,x._2+y._2)})
println("avgage2 :" + tuple2._1.toDouble /tuple2._2 )
val combRdd: RDD[(Int, (Int, Int))] = rdd.map(x => x.swap).combineByKey(
(age: Int) => (age, 1), //createCombiner: the first age for a key starts a (sum, count) pair
(acc: (Int, Int), age: Int) => (acc._1 + age, acc._2 + 1), //mergeValue: fold another age into the partition-local (sum, count)
(a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2) //mergeCombiners: merge (sum, count) pairs across partitions
)
combRdd.foreach(println)
val rdd2 = combRdd.map(x => ("avgage3 :", x._2._1.toDouble / x._2._2)) //average age computed from the (sum, count) value
rdd2.foreach(println)
}
}
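Besides reduce, fold and combineByKey, the same (sum, count) idea fits aggregate(), and numeric RDDs also offer a built-in mean(). Below is a short sketch under the same assumptions as the demo above (in/userage.txt with a name and an age per line, separated by a space); UserAgeSketch is only an illustrative name.

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object UserAgeSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("UserAgeSketch")
    val sc = SparkContext.getOrCreate(conf)
    val ages: RDD[Int] = sc.textFile("in/userage.txt").map(_.split(" ")(1).toInt)

    //aggregate keeps a (sum, count) accumulator, just like the fold and combineByKey versions above
    val (sumAge, cnt) = ages.aggregate((0, 0))(
      (acc, age) => (acc._1 + age, acc._2 + 1),
      (a, b) => (a._1 + b._1, a._2 + b._2)
    )
    println("avgage4 :" + sumAge.toDouble / cnt)

    //DoubleRDDFunctions provides mean() once the ages are doubles
    println("avgage5 :" + ages.map(_.toDouble).mean())
  }
}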