2.5.3 三者的互相转换
-- 1. RDD <=> DF
a、RDD --> DF
"rdd.toDF("列名1","列名2",...)"
b、DF --> RDD
"df.rdd"
-- 2. RDD <=> DS
a、 RDD => DS
将rdd的数据转换为样例类的格式。
"rdd.toDS"
b、 DS => RDD
"ds.rdd"
-- 3. DF <=> DS
a、DF => DS
"df.as[样例类]",该样例类必须存在,而且df中的数据和样例类的字段一一对应
b、 DS => DF
"ds.toDF"
-- 说明:
a、通过DF转换得来的RDD的数据类型是Row。
b、通过DS转换得来的RDD的数据类型和DS的数据类型一致
c、RDD:只关心数据本身
DataFrame:关心数据的结构
DataSet:关心数据类型
生成模拟数据
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object SparkSQL16_Req_Mock {
def main(args: Array[String]): Unit = {
System.setProperty("HADOOP_USER_NAME", "root")
// TODO SparkSQL
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SQL")
// TODO 操作Hive时,启用Hive支持
val spark : SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()
// TODO 向Hive中建表并插入数据
// Scala中如果写sql,使用双引号时比较不方便,所以一般采用多行字符串
spark.sql("use atguigu200317")
spark.sql(
"""
|CREATE TABLE `user_visit_action`(
| `date` string,
| `user_id` bigint,
| `session_id` string,
| `page_id` bigint,
| `action_time` string,
| `search_keyword` string,
| `click_category_id` bigint,
| `click_product_id` bigint,
| `order_category_ids` string,
| `order_product_ids` string,
| `pay_category_ids` string,
| `pay_product_ids` string,
| `city_id` bigint)
|row format delimited fields terminated by '\t'
""".stripMargin)
spark.sql(
"""
| load data local inpath 'input/user_visit_action.txt' into table atguigu200317.user_visit_action
""".stripMargin)
spark.sql(
"""
|CREATE TABLE `product_info`(
| `product_id` bigint,
| `product_name` string,
| `extend_info` string)
|row format delimited fields terminated by '\t'
""".stripMargin)
spark.sql(
"""
| load data local inpath 'input/product_info.txt' into table atguigu200317.product_info
""".stripMargin)
spark.sql(
"""
|CREATE TABLE `city_info`(
| `city_id` bigint,
| `city_name` string,
| `area` string)
|row format delimited fields terminated by '\t'
""".stripMargin)
spark.sql(
"""
| load data local inpath 'input/city_info.txt'