Spark DataFrames and Datasets

JSON file contents:

{"name":"Michael"}

{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

-- Load a JSON file into a DataFrame
scala> val df = sqlContext.jsonFile("/spark/json")
warning: there were 1 deprecation warning(s); re-run with -deprecation for details
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string]          
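Note: jsonFile has been deprecated since Spark 1.4; its replacement, sqlContext.read.json(...), is used in the programmatic example further below.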
// Show the contents
scala> df.show
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+
 -- Print the schema
scala> df.printSchema
root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
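The schema can also be inspected programmatically rather than printed (a minimal sketch against the same df):

// StructType exposes each field's name, type and nullability
df.schema.fields.foreach(f => println(s"${f.name}: ${f.dataType} (nullable = ${f.nullable})"))

// dtypes returns (columnName, typeName) pairs
df.dtypes.foreach(println)  // (age,LongType), (name,StringType)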
 -- Select columns
scala> df.select("age").show
+----+
| age|
+----+
|null|
|  30|
|  19|
+----+
scala> df.select("age","name").show
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+
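Columns can also be referenced as expressions through the $"..." syntax once the implicits are imported (a sketch, same df as above; in spark-shell the import is already in scope):

import sqlContext.implicits._  // enables the $"colName" syntax

df.select($"age", $"name").show()  // equivalent to df.select("age", "name")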

 -- Operate on column values: add 2 to the age column
scala> df.select(df("name"),df("age")+2).show
+-------+---------+
|   name|(age + 2)|
+-------+---------+
|Michael|     null|
|   Andy|       32|
| Justin|       21|
+-------+---------+
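To keep every column and append the derived value as a new one, withColumn can be used instead of select (a sketch):

// Appends a computed column instead of projecting only two columns
df.withColumn("age_plus_2", df("age") + 2).show()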

 -- Filtering rows with filter
scala> df.filter(df("age")>20).show
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+
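filter also accepts a SQL expression string, and Column conditions can be combined (a sketch against the same df):

df.filter("age > 20").show()                        // SQL-style predicate string
df.filter(df("age") > 18 && df("age") < 25).show()  // combined Column conditions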

-- Grouping -- groupBy must be followed by an aggregate function
scala> df.groupBy("age").count().show()
+----+-----+                                                                    
| age|count|
+----+-----+
|null|    1|
|  19|    1|
|  30|    1|
+----+-----+
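The same aggregation can be expressed in SQL by first registering the DataFrame as a temporary table (a sketch using the Spark 1.x API):

df.registerTempTable("people")  // makes df queryable by name
sqlContext.sql("SELECT age, COUNT(*) AS count FROM people GROUP BY age").show()

-- The same walkthrough, written as a standalone program: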
val sc: SparkContext // An existing SparkContext.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)


// Create the DataFrame
val df = sqlContext.read.json("examples/src/main/resources/people.json")


// Show the content of the DataFrame
df.show()
// age  name
// null Michael
// 30   Andy
// 19   Justin


// Print the schema in a tree format
df.printSchema()
// root
// |-- age: long (nullable = true)
// |-- name: string (nullable = true)


// Select only the "name" column
df.select("name").show()
// name
// Michael
// Andy
// Justin


// Select everybody, but increment the age by 1
df.select(df("name"), df("age") + 1).show()
// name    (age + 1)
// Michael null
// Andy    31
// Justin  20


// Select people older than 21
df.filter(df("age") > 21).show()
// age name
// 30  Andy


// Count people by age
df.groupBy("age").count().show()
// age  count
// null 1
// 19   1
// 30   1
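Results can be persisted with the DataFrameWriter available since Spark 1.4 (a minimal sketch; the output path is only an example):

// Write the per-age counts out as Parquet
df.groupBy("age").count().write.parquet("examples/output/age_counts")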

-- Read a Hive table into a DataFrame
scala> val df = sqlContext.sql("select * from t_hdrc_type_month");
scala> df.count
res6: Long = 121144
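Querying Hive tables requires Hive support: in a spark-shell built with Hive, the predefined sqlContext is already a HiveContext. Constructed explicitly it would look like this (a sketch, using the table name from above):

import org.apache.spark.sql.hive.HiveContext

val hiveContext = new HiveContext(sc)  // needs a Hive-enabled Spark build
val df = hiveContext.sql("select * from t_hdrc_type_month")
df.count()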




-- Creating Datasets: converting between DataFrames and Datasets


// Encoders for most common types are automatically provided by importing sqlContext.implicits._
import sqlContext.implicits._
val ds = Seq(1, 2, 3).toDS()
ds.map(_ + 1).collect() // Returns: Array(2, 3, 4)


// Encoders are also created for case classes.
case class Person(name: String, age: Long)
val ds = Seq(Person("Andy", 32)).toDS()
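The case-class fields are then available in typed transformations (a sketch on the ds just created):

ds.map(_.name).collect()      // Array("Andy")
ds.filter(_.age > 30).show()  // typed predicate on the age field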


// DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name.
val path = "examples/src/main/resources/people.json"
val people = sqlContext.read.json(path).as[Person]


-- Converting a DataFrame to a Dataset
scala> sqlContext.read.json("hdfs://suixingpay199:9000/user/app/spark/people.json")
res6: org.apache.spark.sql.DataFrame = [age: bigint, name: string]


scala> res6.as[Person]
res8: org.apache.spark.sql.Dataset[Person] = [name: string, age: bigint]
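The conversion also works in the other direction: a Dataset can be turned back into an untyped DataFrame with toDF (a sketch, reusing res8 from above):

val peopleDF = res8.toDF()  // Dataset[Person] -> DataFrame
peopleDF.printSchema()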




-- Loading data from MySQL via JDBC
val jdbcDF = sqlContext.load("jdbc", Map("url" -> "jdbc:mysql://suixingpay190:3306/azkaban?user=root&password=123", "dbtable" -> "projects"))
scala> jdbcDF.show
+---+--------------+------+-------------+-------------+-------+----------------+--------------+--------+--------------------+
| id|          name|active|modified_time|  create_time|version|last_modified_by|   description|enc_type|       settings_blob|
+---+--------------+------+-------------+-------------+-------+----------------+--------------+--------+--------------------+
|  1|          test|  true|1452602214512|1452599858502|      4|         azkaban|          test|       2|[31, -117, 8, 0, ...|
|  2|         touch|  true|1452655867334|1452601523613|      7|         azkaban|         touch|       2|[31, -117, 8, 0, ...|
|  3|DataWherehouse| false|1452688047362|1452662944078|     34|         azkaban|datawherehouse|       2|[31, -117, 8, 0, ...|
|  4|DataWhereHouse|  true|1452689575193|1452688067801|      3|         azkaban|DataWhereHouse|       2|[31, -117, 8, 0, ...|
|  5|  HIveAnalysis|  true|1453440935221|1453440922278|      1|           baige|  HIveAnalysis|       2|[31, -117, 8, 0, ...|
+---+--------------+------+-------------+-------------+-------+----------------+--------------+--------+--------------------+
-- Select specific columns
jdbcDF.select("name","id").show
+--------------+---+
|          name| id|
+--------------+---+
|          test|  1|
|         touch|  2|
|DataWherehouse|  3|
|DataWhereHouse|  4|
|  HIveAnalysis|  5|
+--------------+---+
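sqlContext.load has been deprecated since Spark 1.4; the same source can be read through the DataFrameReader (a sketch with the connection details from above):

val jdbcDF = sqlContext.read.format("jdbc")
  .option("url", "jdbc:mysql://suixingpay190:3306/azkaban?user=root&password=123")
  .option("dbtable", "projects")
  .load()

jdbcDF.select("name", "id").show()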