Loading Common External Data Sources
Dependencies to add to a Maven project:
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_2.11</artifactId>
  <version>2.1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.11</artifactId>
  <version>2.1.1</version>
</dependency>
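If the project is built with sbt instead of Maven, the same artifacts can be declared with equivalent coordinates (a sketch; adjust the versions to your own environment):
// build.sbt sketch: same Spark 2.1.1 artifacts for Scala 2.11 as the Maven dependencies above
scalaVersion := "2.11.12"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.1.1",
  "org.apache.spark" %% "spark-sql"  % "2.1.1"
)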
CSV files:
Sample CSV file: users.csv
user_id,locale,birthyear,gender,joinedAt,location,timezone
3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480
3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420
823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford Ontario,-240
1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran Iran,210
3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420
627175141,ka_GE,1973,female,2012-11-01T09:59:17.590Z,Tbilisi Georgia,240
2752000443,id_ID,1994,male,2012-10-03T05:22:17.637Z,Medan Indonesia,420
3473687777,id_ID,1965,female,2012-10-03T12:19:29.975Z,Medan Indonesia,420
2966052962,id_ID,1979,male,2012-10-31T10:11:57.668Z,Medan Indonesia,420
---------------------- (remaining rows omitted) -----------------------------
Reading the file with the RDD API:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object CSVDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("csvtest")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("in/users.csv")
    // Drop the header row of the CSV file. textFile assigns file splits in order,
    // so the first line of the file always sits at the start of partition 0.
    val fields: RDD[Array[String]] = lines.mapPartitionsWithIndex((i, v) => {
      if (i == 0) v.drop(1)
      else v
    }).map(x => x.split(","))
    println(fields.count())
  }
}
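Once the header is dropped and each line is split, the field arrays can be mapped onto a case class for more readable downstream code. A minimal sketch (the User case class below is illustrative and follows the column order of users.csv; define it outside main):
// Illustrative case class matching the users.csv column order
case class User(userId: String, locale: String, birthyear: String,
                gender: String, joinedAt: String, location: String, timezone: String)

// inside main, after fields has been built:
val users: RDD[User] = fields.map(a => User(a(0), a(1), a(2), a(3), a(4), a(5), a(6)))
users.take(3).foreach(println)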
Reading the file as a DataFrame:
import org.apache.spark.sql.{DataFrame, SparkSession}

object CSVDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("csvdemo").getOrCreate()
    val df: DataFrame = spark.read.format("csv").option("header", "true").load("in/users.csv")
    df.printSchema()
    df.select("user_id", "locale", "birthyear").show(5)
    /* With option("header","false"), the printed schema is:
    root
     |-- _c0: string (nullable = true)
     |-- _c1: string (nullable = true)
     |-- _c2: string (nullable = true)
     . . . . . . . . . . .
    the first row of the printed result is the header line itself:
    +--------+-------+----------+-------+---------+---------+---------+
    |     _c0|    _c1|       _c2|    _c3|      _c4|      _c5|      _c6|
    +--------+-------+----------+-------+---------+---------+---------+
    | user_id| locale| birthyear| gender| joinedAt| location| timezone|
    +--------+-------+----------+-------+---------+---------+---------+
    and select("user_id","locale","birthyear") fails with:
    Exception in thread "main" org.apache.spark.sql.AnalysisException: cannot resolve '`user_id`'
    given input columns: [_c3, _c1, _c6, _c0, _c5, _c2, _c4];;
    */
  }
}
Output:
root
|-- user_id: string (nullable = true)
|-- locale: string (nullable = true)
|-- birthyear: string (nullable = true)
|-- gender: string (nullable = true)
|-- joinedAt: string (nullable = true)
|-- location: string (nullable = true)
|-- timezone: string (nullable = true)
+----------+------+---------+
| user_id|locale|birthyear|
+----------+------+---------+
|3197468391| id_ID| 1993|
|3537982273| id_ID| 1992|
| 823183725| en_US| 1975|
|1872223848| en_US| 1991|
|3429017717| id_ID| 1995|
+----------+------+---------+
only showing top 5 rows
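Every column above comes back as string because the CSV source does not infer types unless asked. For typed columns right away, the inferSchema option (or an explicit schema) can be enabled; a quick sketch:
// Sketch: let Spark sample the file and infer column types instead of treating everything as string
val typed: DataFrame = spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("in/users.csv")
typed.printSchema()   // numeric columns such as birthyear and timezone are no longer plain strings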
Renaming columns (these examples assume the file was loaded with option("header","false"), so the columns are named _c0 ... _c6):
val df1: DataFrame = df.withColumnRenamed("_c0","id")
df1.printSchema()
/*
root
|-- id: string (nullable = true)
|-- _c1: string (nullable = true)
|-- _c2: string (nullable = true)
. . . . . . . . . . .
*/
val df2: DataFrame = df.withColumn("id",df("_c0").cast("long"))
df2.printSchema()
/*
root
|-- _c0: string (nullable = true)
|-- _c1: string (nullable = true)
. . . . . .
|-- id: long (nullable = true)
*/
val df3: DataFrame = df.withColumn("id",df.col("_c0").cast("long")).drop("_c0")
df3.printSchema()
/*
root
|-- _c1: string (nullable = true)
|-- _c2: string (nullable = true)
. . . . . .
|-- id: long (nullable = true)
*/
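Renaming seven positional columns one by one is tedious; when the whole header is known in advance, toDF can rename them all at once. A sketch against the same header=false DataFrame (note the header text itself is still present as a data row):
// Sketch: assign all seven column names in a single call
val named: DataFrame = df.toDF("user_id", "locale", "birthyear",
  "gender", "joinedAt", "location", "timezone")
named.printSchema()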
JSON files
Sample JSON file: users.json
{"name":"Michael"}
{"name":"Andy","Age":30}
{"name":"Justin","Age":19}
Reading the JSON file with the RDD API:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object JsonDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("json")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("in/users.json")
    lines.collect().foreach(println)
    /*
    {"name":"Michael"}
    {"name":"Andy","Age":30}
    {"name":"Justin","Age":19}
    */
    // Parse each line with the Scala standard library's JSON parser;
    // parseFull returns Option[Any], where a JSON object becomes a Map[String, Any].
    import scala.util.parsing.json._
    val rdd: RDD[Option[Any]] = lines.map(x => JSON.parseFull(x))
    rdd.collect().foreach(println)
    /*
    Some(Map(name -> Michael))
    Some(Map(name -> Andy, Age -> 30.0))
    Some(Map(name -> Justin, Age -> 19.0))
    */
  }
}
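parseFull only gives back Option[Any], so getting at individual fields requires pattern matching on the underlying Map. A minimal sketch continuing from the rdd above:
// Sketch: pull name and Age out of the parsed Map[String, Any] (JSON numbers come back as Double)
val people: RDD[(String, Option[Int])] = rdd.collect {
  case Some(m: Map[String, Any] @unchecked) =>
    (m("name").toString, m.get("Age").map(_.asInstanceOf[Double].toInt))
}
people.collect().foreach(println)
// expected: (Michael,None), (Andy,Some(30)), (Justin,Some(19))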
Reading the JSON file as a DataFrame:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

object JsonDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("json")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    // note: "header" is a CSV option and is ignored by the JSON source
    val df: DataFrame = spark.read.format("json").option("header", true).load("in/users.json")
    df.printSchema()
    df.show()
  }
}
Output:
root
|-- Age: long (nullable = true)
|-- name: string (nullable = true)
+----+-------+
| Age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
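The same read can be written with the json shortcut on DataFrameReader, and a schema can be supplied explicitly if you want Age as int instead of the inferred long; a sketch:
// Sketch: shorthand json() reader plus an explicit schema instead of relying on inference
import org.apache.spark.sql.types._
val jsonSchema = StructType(Seq(
  StructField("name", StringType),
  StructField("Age", IntegerType)
))
val typedDf: DataFrame = spark.read.schema(jsonSchema).json("in/users.json")
typedDf.printSchema()   // Age is int here rather than the inferred long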
Renaming a column and changing its data type:
// Rename a column: withColumnRenamed(existingName: String, newName: String)
val df1: DataFrame = df.withColumnRenamed("Age", "age")
df1.printSchema()
// Change the column's data type
val df2: DataFrame = df1.withColumn("age", df1.col("age").cast("int"))
df2.printSchema()
Output:
root
|-- age: long (nullable = true)
|-- name: string (nullable = true)
root
|-- age: integer (nullable = true)
|-- name: string (nullable = true)
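Beyond renaming and casting, a DataFrame whose columns already have the desired names and types can be turned into a strongly typed Dataset; a sketch assuming df2 from above and an illustrative Person case class (defined outside main so Spark can derive an encoder for it):
// Illustrative case class; age is an Option because Michael's record has no Age field
case class Person(name: String, age: Option[Int])

// inside main, after df2 has been built:
import spark.implicits._
val ds = df2.as[Person]
ds.show()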
Addendum (2020-11-16):
Spark SQL and external data sources: Parquet files
Writing a Parquet file with Spark SQL:
package nj.zb.kb09.sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._

object ParDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("parquetdemo").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    val list = List(
      ("zhangsan", "red", Array(3, 4, 5)),
      ("lisi", "black", Array(12, 14, 15)),
      ("wangwu", "orange", Array(23, 64, 15)),
      ("zhaoliu", "red", Array(13, 84, 15))
    )
    val rdd1: RDD[(String, String, Array[Int])] = sc.parallelize(list)
    // Specify the schema directly with a StructType
    val schema = StructType(
      Array(
        StructField("name", StringType),
        StructField("color", StringType),
        StructField("numbers", ArrayType(IntegerType))
      )
    )
    val rowRDD: RDD[Row] = rdd1.map(x => Row(x._1, x._2, x._3))
    val df: DataFrame = spark.createDataFrame(rowRDD, schema)
    df.show()
    // Write Parquet files into this directory
    df.write.parquet("out/color")
  }
}
Generated files: the out/color directory with the Parquet part files (screenshot omitted).
Reading the file back:
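Note that df.write.parquet fails if out/color already exists. For re-runs a save mode can be set on the writer, and the output can also be partitioned by a column; a brief sketch:
// Sketch: overwrite existing output and partition the Parquet files by the color column
df.write
  .mode("overwrite")
  .partitionBy("color")
  .parquet("out/color_partitioned")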
val frame: DataFrame = spark.read.parquet("out/color")
frame.printSchema()
frame.show()
Output:
root
|-- name: string (nullable = true)
|-- color: string (nullable = true)
|-- numbers: array (nullable = true)
| |-- element: integer (containsNull = true)
+--------+------+------------+
| name| color| numbers|
+--------+------+------------+
| wangwu|orange|[23, 64, 15]|
|zhangsan| red| [3, 4, 5]|
| zhaoliu| red|[13, 84, 15]|
| lisi| black|[12, 14, 15]|
+--------+------+------------+
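Once read back, the Parquet-backed DataFrame can also be queried with SQL by registering it as a temporary view; a short sketch:
// Sketch: expose the Parquet data to SQL via a temporary view
frame.createOrReplaceTempView("colors")
spark.sql("SELECT color, count(*) AS cnt FROM colors GROUP BY color").show()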