Class Notes, 2020-11-12 (Loading Common External Data Sources)

Loading common external data sources

Dependencies to add to a Maven project:

<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_2.11</artifactId>
  <version>2.1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.11</artifactId>
  <version>2.1.1</version>
</dependency>

CSV files:

Sample CSV file: users.csv

user_id,locale,birthyear,gender,joinedAt,location,timezone
3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan  Indonesia,480
3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan  Indonesia,420
823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford  Ontario,-240
1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran  Iran,210
3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420
627175141,ka_GE,1973,female,2012-11-01T09:59:17.590Z,Tbilisi  Georgia,240
2752000443,id_ID,1994,male,2012-10-03T05:22:17.637Z,Medan  Indonesia,420
3473687777,id_ID,1965,female,2012-10-03T12:19:29.975Z,Medan  Indonesia,420
2966052962,id_ID,1979,male,2012-10-31T10:11:57.668Z,Medan  Indonesia,420
---------------------- remaining rows omitted -----------------------------
Reading the file with an RDD:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object CSVDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("csvtest")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("in/users.csv")
    // Drop the CSV header row. Targeting partition 0 is safe because textFile assigns the start of the file to partition 0, so the header always lands there.
    val fields: RDD[Array[String]] = lines.mapPartitionsWithIndex((i, v) => {
      if (i == 0) v.drop(1)
      else v
    }).map(x => x.split(","))
    println(fields.count())
  }
}
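A variant that does not depend on which partition holds the header is to read the header with first() and filter it out. A minimal sketch, reusing the SparkContext sc from the example above (the val names are illustrative):

// Alternative header removal that does not rely on partition ordering:
// read the header with first(), then filter out every line equal to it.
val csvLines: RDD[String] = sc.textFile("in/users.csv")
val header: String = csvLines.first()
val rows: RDD[Array[String]] = csvLines
  .filter(line => line != header)
  .map(_.split(","))
println(rows.count())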
Reading the file as a DataFrame:
import org.apache.spark.sql.{DataFrame, SparkSession}

object CSVDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("csvdemo").getOrCreate()
    val df: DataFrame = spark.read.format("csv").option("header","true").load("in/users.csv")
    df.printSchema()
    df.select("user_id","locale","birthyear").show(5)
    /*  With option("header","false"), the schema is printed as:
        root
         |-- _c0: string (nullable = true)
         |-- _c1: string (nullable = true)
         |-- _c2: string (nullable = true)
         . . . . . . . . . . .
        and the first row of the printed result is the header line itself:
        +--------+-------+----------+-------+---------+---------+---------+
        |     _c0|    _c1|       _c2|    _c3|      _c4|      _c5|      _c6|
        +--------+-------+----------+-------+---------+---------+---------+
        | user_id| locale| birthyear| gender| joinedAt| location| timezone|
        +--------+-------+----------+-------+---------+---------+---------+
        so select("user_id","locale","birthyear") fails with:
        Exception in thread "main" org.apache.spark.sql.AnalysisException: cannot resolve '`user_id`'
        given input columns: [_c3, _c1, _c6, _c0, _c5, _c2, _c4];;
    */
  }
}

Output:

root
 |-- user_id: string (nullable = true)
 |-- locale: string (nullable = true)
 |-- birthyear: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- joinedAt: string (nullable = true)
 |-- location: string (nullable = true)
 |-- timezone: string (nullable = true)

+----------+------+---------+
|   user_id|locale|birthyear|
+----------+------+---------+
|3197468391| id_ID|     1993|
|3537982273| id_ID|     1992|
| 823183725| en_US|     1975|
|1872223848| en_US|     1991|
|3429017717| id_ID|     1995|
+----------+------+---------+
only showing top 5 rows
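Note that with header=true every column is still loaded as a string. Two standard DataFrameReader options can produce typed columns; the schema below is an assumption about the intended column types, written as a sketch rather than the original notes' code:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._

// Option 1: let Spark infer the column types (adds an extra pass over the file).
val inferred: DataFrame = spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("in/users.csv")
inferred.printSchema()

// Option 2: supply an explicit schema, so no inference pass is needed.
val userSchema = StructType(Array(
  StructField("user_id", LongType),
  StructField("locale", StringType),
  StructField("birthyear", IntegerType),
  StructField("gender", StringType),
  StructField("joinedAt", StringType),
  StructField("location", StringType),
  StructField("timezone", IntegerType)
))
val typedUsers: DataFrame = spark.read
  .format("csv")
  .option("header", "true")
  .schema(userSchema)
  .load("in/users.csv")
typedUsers.printSchema()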
Renaming columns (continuing from the header=false case, where the columns are _c0, _c1, ...):
val df1: DataFrame = df.withColumnRenamed("_c0","id")
df1.printSchema()
/*
root
 |-- id: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true) 
      . . . . . . . . . . .
*/
val df2: DataFrame = df.withColumn("id",df("_c0").cast("long"))
df2.printSchema()
/*
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
      . . . . . . 
 |-- id: long (nullable = true)
*/
val df3: DataFrame = df.withColumn("id",df.col("_c0").cast("long")).drop("_c0")
df3.printSchema()
/*
root
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
      . . . . . .
 |-- id: long (nullable = true)
*/
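Renaming and casting column by column gets verbose. As a sketch for the same header=false DataFrame df (the target column names come from the CSV header; everything else is illustrative), toDF can rename all positional columns in one call and the casts can follow:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

// Rename the seven positional columns _c0 .. _c6 in a single call ...
val named: DataFrame = df.toDF(
  "user_id", "locale", "birthyear", "gender", "joinedAt", "location", "timezone")

// ... then cast the columns that should not remain strings.
val renamedTyped: DataFrame = named
  .withColumn("user_id", col("user_id").cast("long"))
  .withColumn("birthyear", col("birthyear").cast("int"))
  .withColumn("timezone", col("timezone").cast("int"))
renamedTyped.printSchema()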

JSON files

Sample JSON file: users.json

{"name":"Michael"}
{"name":"Andy","Age":30}
{"name":"Justin","Age":19}
Reading the JSON file with an RDD:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object JsonDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("json")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("in/users.json")
    lines.collect().foreach(println)
    /*
    {"name":"Michael"}
    {"name":"Andy","Age":30}
    {"name":"Justin","Age":19}
    */
    import scala.util.parsing.json._
    val rdd: RDD[Option[Any]] = lines.map(x=>JSON.parseFull(x))
    rdd.collect().foreach(println)
    /*
    Some(Map(name -> Michael))
    Some(Map(name -> Andy, Age -> 30.0))
    Some(Map(name -> Justin, Age -> 19.0))
    */
  }
}
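JSON.parseFull returns Option[Any], so pulling concrete fields out of the parsed maps takes some pattern matching. A minimal sketch that could be appended inside the main method above (field handling follows users.json; the parser reads JSON numbers as Double):

    // Extract (name, age) pairs from the parsed maps; Age may be missing, so it stays an Option.
    val people: RDD[(String, Option[Int])] = rdd.collect {
      case Some(m: Map[String, Any] @unchecked) =>
        val name = m.getOrElse("name", "").toString
        val age  = m.get("Age").map(_.asInstanceOf[Double].toInt)
        (name, age)
    }
    people.collect().foreach(println)
    // e.g. (Michael,None), (Andy,Some(30)), (Justin,Some(19))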
Reading the JSON file as a DataFrame:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

object JsonDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("json")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    // The "header" option only applies to the CSV source, so it is dropped here.
    val df: DataFrame = spark.read.format("json").load("in/users.json")
    df.printSchema()
    df.show()
  }
}

Output:

root
 |-- Age: long (nullable = true)
 |-- name: string (nullable = true)

+----+-------+
| Age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

Renaming columns:

// Rename a column: withColumnRenamed(existingName: String, newName: String)
val df1: DataFrame = df.withColumnRenamed("Age","age")
df1.printSchema()
// Change the column's data type
val df2: DataFrame = df1.withColumn("age",df1.col("age").cast("int"))
df2.printSchema()

Output:

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)

Addendum (2020-11-16):

Spark SQL and external data sources: Parquet files

Writing a Parquet file with Spark SQL:

package nj.zb.kb09.sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._

object ParDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("parquetdemo").getOrCreate()
    val sc: SparkContext = spark.sparkContext

    val list = List(
      ("zhangsan", "red", Array(3, 4, 5)),
      ("lisi", "black", Array(12, 14, 15)),
      ("wangwu", "orange", Array(23, 64, 15)),
      ("zhaoliu", "red", Array(13, 84, 15))
    )
    val rdd1: RDD[(String, String, Array[Int])] = sc.parallelize(list)
    // Specify the schema directly with a StructType
    val schema = StructType(
      Array(
        StructField("name", StringType),
        StructField("color", StringType),
        StructField("numbers", ArrayType(IntegerType))
      )
    )
    val rowRDD: RDD[Row] = rdd1.map(x=>Row(x._1,x._2,x._3))
    val df: DataFrame = spark.createDataFrame(rowRDD,schema)
    df.show()
    // Write the parquet files into this directory
    df.write.parquet("out/color")
  }
}

Generated files (screenshot of the out/color output directory omitted):

Reading the files back:

val frame: DataFrame = spark.read.parquet("out/color")
frame.printSchema()
frame.show()

Output:

root
 |-- name: string (nullable = true)
 |-- color: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: integer (containsNull = true)

+--------+------+------------+
|    name| color|     numbers|
+--------+------+------------+
|  wangwu|orange|[23, 64, 15]|
|zhangsan|   red|   [3, 4, 5]|
| zhaoliu|   red|[13, 84, 15]|
|    lisi| black|[12, 14, 15]|
+--------+------+------------+
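Note that df.write.parquet("out/color") fails on a second run because the writer's default save mode is ErrorIfExists. A hedged sketch of the same write with an explicit save mode and a partition column (the output path and the choice of "color" as the partition key are illustrative):

import org.apache.spark.sql.SaveMode

// Overwrite the output directory instead of failing when it already exists,
// and lay out the parquet files partitioned by the "color" column.
df.write
  .mode(SaveMode.Overwrite)
  .partitionBy("color")
  .parquet("out/color_partitioned")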