package spark
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
object Spark1SQL {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Spark1SQL").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // SQLContext is the Spark SQL enhancement on top of SparkContext
    val sqlContext = new SQLContext(sc)
    val lines = sc.textFile("D:\\code\\ip\\student.txt")
    val rowRDD: RDD[Row] = lines.map(line => {
      val fields = line.split(",")
      val id = fields(0).toLong
      val name = fields(1)
      val age = fields(2).toInt
      val score = fields(3).toDouble
      Row(id, name, age, score)
    })
    // The result type: essentially the table header, used to describe the DataFrame
    val sch: StructType = StructType(List(
      StructField("id", LongType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true),
      StructField("score", DoubleType, true)
    ))
    // Create the DataFrame
    val bdf: DataFrame = sqlContext.createDataFrame(rowRDD, sch)
    // First register the DataFrame as a temporary table
    bdf.registerTempTable("t_boy")
    // Write the SQL (the sql method is in fact a Transformation)
    val result: DataFrame = sqlContext.sql("SELECT * FROM t_boy ORDER BY score DESC, age ASC")
    // View the result (triggers an Action)
    result.show()
    sc.stop()
  }
}
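
For reference, here is roughly how the same query looks on the Spark 2.x API, where SparkSession replaces SQLContext and createOrReplaceTempView replaces the there-deprecated registerTempTable. A minimal sketch assuming the same student.txt layout; the object name Spark1SQLv2 is illustrative:

package spark

import org.apache.spark.sql.{DataFrame, SparkSession}

object Spark1SQLv2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Spark1SQLv2").master("local[2]").getOrCreate()
    import spark.implicits._
    // Parse each line into a tuple; the column names come from toDF
    val bdf: DataFrame = spark.read.textFile("D:\\code\\ip\\student.txt")
      .map(line => {
        val f = line.split(",")
        (f(0).toLong, f(1), f(2).toInt, f(3).toDouble)
      })
      .toDF("id", "name", "age", "score")
    // createOrReplaceTempView is the Spark 2.x replacement for registerTempTable
    bdf.createOrReplaceTempView("t_boy")
    val result: DataFrame = spark.sql("SELECT * FROM t_boy ORDER BY score DESC, age ASC")
    result.show()
    spark.stop()
  }
}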
package spark
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
object Spark1DF {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Spark1DF").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val lines = sc.textFile("D:\\code\\ip\\student.txt")
    val rowRDD: RDD[Row] = lines.map(line => {
      val fields = line.split(",")
      val id = fields(0).toLong
      val name = fields(1)
      val age = fields(2).toInt
      val score = fields(3).toDouble
      Row(id, name, age, score)
    })
    val sch: StructType = StructType(List(
      StructField("id", LongType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true),
      StructField("score", DoubleType, true)
    ))
    val bdf: DataFrame = sqlContext.createDataFrame(rowRDD, sch)
    // Use the DataFrame API
    val df1: DataFrame = bdf.select("name", "age", "score")
    import sqlContext.implicits._
    val df2: DataFrame = df1.orderBy($"score".desc, $"age".asc)
    df2.show()
    sc.stop()
  }
}
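
A note on construction style: instead of building Row objects and an explicit StructType, Spark 1.x can also infer the schema by reflection from a case class. A minimal sketch; the Student case class and the object name are illustrative:

package spark

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

// Illustrative case class; Spark infers the schema from its fields
case class Student(id: Long, name: String, age: Int, score: Double)

object Spark1DFCaseClass {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Spark1DFCaseClass").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    val bdf: DataFrame = sc.textFile("D:\\code\\ip\\student.txt")
      .map(line => {
        val f = line.split(",")
        Student(f(0).toLong, f(1), f(2).toInt, f(3).toDouble)
      })
      .toDF() // columns id, name, age, score inferred from the case class
    bdf.select("name", "age", "score").orderBy($"score".desc, $"age".asc).show()
    sc.stop()
  }
}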
package spark
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
object Spark2SQL {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Spark2SQL").master("local[*]").getOrCreate()
    // Dataset is a distributed collection, a further wrapper over RDD (a smarter RDD);
    // here it has a single column, named "value" by default
    val lines: Dataset[String] = spark.read.textFile("D:\\code\\ip\\words.txt")
    import spark.implicits._
    // Thanks to the implicit conversions this can be used directly as a DataFrame
    val words: Dataset[String] = lines.flatMap(_.split(" "))
    // Register the view
    words.createTempView("v_wc")
    val result: DataFrame = spark.sql("SELECT value word, COUNT(*) counts FROM v_wc GROUP BY word ORDER BY counts DESC")
    result.show()
    spark.stop()
  }
}
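
One caveat on the registration step: createTempView throws an AnalysisException if a view with that name already exists in the session. When re-running in the same session, the registration line above could instead use the idempotent variant:

// Overwrites an existing view of the same name instead of failing
words.createOrReplaceTempView("v_wc")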
package spark
import org.apache.spark.sql.{Dataset, SparkSession}
object Spark2DF {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Spark2DF").master("local[*]").getOrCreate()
    val lines: Dataset[String] = spark.read.textFile("D:\\code\\ip\\words.txt")
    import spark.implicits._
    val words: Dataset[String] = lines.flatMap(_.split(" "))
    // Use the Dataset API (DSL); import the aggregate functions
    import org.apache.spark.sql.functions._
    // val r = words.groupBy($"value" as "word").count().count
    val counts = words.groupBy($"value".as("word")).agg(count("*").as("counts")).orderBy($"counts".desc)
    counts.show()
    // println(r)
    spark.stop()
  }
}
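
For comparison, the same count can stay entirely in the typed Dataset API via groupByKey; a minimal sketch that would slot into the same main (the val name typedCounts is illustrative):

// Typed variant: groupByKey(identity) groups equal words,
// and count() then yields a Dataset[(String, Long)]
val typedCounts = words.groupByKey(identity).count()
typedCounts.show()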