- Spark session available as 'spark'.
- Welcome to Spark version 2.0.1
-
- Using Scala version 2.11.8
-
- import org.apache.spark.sql.SparkSession
- import org.apache.spark.sql.DataFrame
- import org.apache.spark.rdd.RDD
- import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
- import org.apache.spark.sql.Encoder
-
- scala>
-
- scala> val spark = SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()
- 16/11/05 15:40:31 WARN SparkSession$Builder: Use an existing SparkSession, some configuration may not take effect.
- spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@68d97cfb
-
- scala>
-
- scala> // For implicit conversions like converting RDDs to DataFrames
- scala> import spark.implicits._
-
-
- scala>
-
- scala> // 创建数据框 (create the DataFrame)
-
- scala> // val data1:DataFrame=spark.read.csv("hdfs://ns1/datafile/wangxiao/Affairs.csv")
-
- scala>
-
- scala> val data1: DataFrame = spark.read.option("header", true).format("csv").load("hdfs://ns1/datafile/wangxiao/Affairs.csv")
- data1: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]
-
- scala>
-
- scala> data1.printSchema()
- root
- |-- affairs: string (nullable = true)
- |-- gender: string (nullable = true)
- |-- age: string (nullable = true)
- |-- yearsmarried: string (nullable = true)
- |-- children: string (nullable = true)
- |-- religiousness: string (nullable = true)
- |-- education: string (nullable = true)
- |-- occupation: string (nullable = true)
- |-- rating: string (nullable = true)
-
-
- scala>
-
- scala> data1.limit(10).show
-
-
-
- scala>
-
- scala> //##############################################
-
- scala> // 转换字符类型 (cast the string columns to numeric types)
-
- scala> val res1 = data1.select(
- | data1("affairs").cast("Double"),
- | data1("age").cast("Double"),
- | data1("yearsmarried").cast("Double"),
- | data1("religiousness").cast("Double"),
- | data1("education").cast("Double"),
- | data1("occupation").cast("Double"),
- | data1("rating").cast("Double"),
- | data1("gender").cast("String"),
- | data1("children").cast("String"))
- res1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 7 more fields]
-
- scala>
-
- scala> res1.printSchema()
- root
- |-- affairs: double (nullable = true)
- |-- age: double (nullable = true)
- |-- yearsmarried: double (nullable = true)
- |-- religiousness: double (nullable = true)
- |-- education: double (nullable = true)
- |-- occupation: double (nullable = true)
- |-- rating: double (nullable = true)
- |-- gender: string (nullable = true)
- |-- children: string (nullable = true)
-
-
- scala>
-
- scala> //################################################
-
- scala> // 创建RDD (create an RDD)
-
- scala> val data2: RDD[String] = spark.sparkContext.textFile("hdfs://ns1/datafile/wangxiao/Affairs.txt")
-
- scala>
-
- scala> case class Affairs1(affairs: Int, gender: String, age: Int,
- | yearsmarried: Double, children: String, religiousness: Int,
- | education: Double, occupation: Double, rating: Int)
- defined class Affairs1
-
- scala>
-
- scala> // RDD转换成数据框 (convert the RDD to a DataFrame)
-
- scala> val res2 = data2.map { _.split(" ") }.map { line =>
- | Affairs1(line(0).toInt, line(1).trim.toString(), line(2).toInt,
- | line(3).toDouble, line(4).trim.toString(), line(5).toInt,
- | line(6).toDouble, line(7).toDouble, line(8).toInt)
- | }.toDF()
- res2: org.apache.spark.sql.DataFrame = [affairs: int, gender: string ... 7 more fields]
-
- scala>
-
- scala> res2.printSchema()
- root
- |-- affairs: integer (nullable = false)
- |-- gender: string (nullable = true)
- |-- age: integer (nullable = false)
- |-- yearsmarried: double (nullable = false)
- |-- children: string (nullable = true)
- |-- religiousness: integer (nullable = false)
- |-- education: double (nullable = false)
- |-- occupation: double (nullable = false)
- |-- rating: integer (nullable = false)
-
-
- scala>
-
- scala> //###############################################
-
- scala> // 创建视图 (create a temporary view)
-
- scala> res1.createOrReplaceTempView("Affairs")
-
- scala>
-
- scala> // 子查询 (subquery)
-
- scala> //val df1 = spark.sql("SELECT * FROM Affairs WHERE age BETWEEN 20 AND 25")
-
- scala> val df1 = spark.sql("select gender, age,rating from ( SELECT * FROM Affairs WHERE age BETWEEN 20 AND 25 ) t ")
- df1: org.apache.spark.sql.DataFrame = [gender: string, age: double ... 1 more field]
-
- scala>
-
- scala> df1.limit(10).show
- +------+----+------+
- |gender| age|rating|
- +------+----+------+
- | male|22.0| 3.0|
- |female|22.0| 3.0|
- | male|22.0| 5.0|
- |female|22.0| 4.0|
- |female|22.0| 4.0|
- |female|22.0| 5.0|
- |female|22.0| 5.0|
- |female|22.0| 5.0|
- |female|22.0| 5.0|
- |female|22.0| 5.0|
- +------+----+------+
-
-
- scala>
-
- scala> // 保存数据框到文件 (save the DataFrame to a file)
-
- scala> data1.select("gender", "age", "education").write.format("csv").save("hdfs://ns1/datafile/wangxiao/data123.csv")
来自 “ ITPUB博客 ” ，链接：http://blog.itpub.net/29070860/viewspace-2127854/，如需转载，请注明出处，否则将追究法律责任。
(Source: ITPUB blog, http://blog.itpub.net/29070860/viewspace-2127854/ — please credit the original source when reprinting.)
转载于：http://blog.itpub.net/29070860/viewspace-2127854/
(Reprinted from: http://blog.itpub.net/29070860/viewspace-2127854/)