1. CSV
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types.{DataTypes, StructType}

object CreateDataFrameFromCSV {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // To infer the schema, every line has to be read and the schema description
    // is sent back to the Driver side. Defining the schema explicitly avoids that pass:
    // val schema = new StructType()
    //   .add("name", DataTypes.StringType)
    //   .add("age", DataTypes.IntegerType)
    //   .add("fv", DataTypes.DoubleType)
    //
    // val df: DataFrame = spark
    //   .read
    //   .schema(schema)
    //   //.option("inferSchema", "true") // infer the data types
    //   .csv("data/user.csv")
    //
    // //val df2 = df.toDF("name", "age", "fv")

    val df = spark.read
      .option("header", true)        // treat the first line as column names
      .option("inferSchema", "true") // infer the data types
      .csv("data/user.csv")
    df.printSchema()
    df.write.csv("out/csv")
    spark.stop()
  }
}
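Sections 4 and 5 below read Parquet and ORC data back from out/par and out/orc. A minimal sketch of how those directories could be produced from the same user.csv is shown here; the object name is illustrative and the paths/columns are assumed to match the CSV example above.

import org.apache.spark.sql.SparkSession

object WriteParquetAndOrc {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Read user.csv with header and inferred types, as in the CSV example
    val df = spark.read
      .option("header", true)
      .option("inferSchema", "true")
      .csv("data/user.csv")

    // Write the same data in columnar formats; these are the directories
    // that the Parquet and ORC examples read from
    df.write.parquet("out/par")
    df.write.orc("out/orc")
    spark.stop()
  }
}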
2. JDBC
import java.util.Properties
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

/**
 * Create a DataFrame via JDBC
 */
object CreateDataFrameFromJDBC {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // JDBC connection properties
    val props = new Properties()
    props.setProperty("driver", "com.mysql.jdbc.Driver")
    props.setProperty("user", "root")
    props.setProperty("password", "123456")
    val url = "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8"

    // The database table already has a schema:
    // the Driver connects to the database and fetches the table's schema information.
    val df: DataFrame = spark.read.jdbc(url, "t_boy", props)
    df.printSchema()
    //df.show()

    // Write the filtered data back to MySQL
    df.where("fv >= 100").write.mode(SaveMode.Append).jdbc(url, "t_person", props)
    //Thread.sleep(100000000)
    spark.stop()
  }
}
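The same JDBC read can also be expressed with the generic format/option API instead of the jdbc() shortcut. The sketch below reuses the url, table, and credentials from the example above; the object name is just illustrative.

import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromJDBCOptions {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Same read as above, expressed with the generic format/option API
    val df: DataFrame = spark.read
      .format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8")
      .option("dbtable", "t_boy")
      .option("user", "root")
      .option("password", "123456")
      .load()
    df.printSchema()
    spark.stop()
  }
}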
3. JSON
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object CreateDataFrameFromJSON {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // To infer the schema, every line is read and the schema description is sent back to the Driver side.
    //val df: DataFrame = spark.read.json("data/user.json")
    val df: DataFrame = spark.read.format("json").load("data/user.json")
    df.printSchema()

    import spark.implicits._
    //df.where($"_corrupt_record".isNull).show()
    // Keep only the rows that parsed correctly: column 0 is _corrupt_record
    // (malformed lines end up there), so it must be null for a valid row.
    val df2: DataFrame = df.filter(row => {
      row.getString(0) == null
    }).select("name", "age", "fv", "gender")

    //SaveMode.ErrorIfExists: throw an error if the output path already exists (the default)
    //SaveMode.Ignore: if the output already exists, skip the write without raising an error
    //SaveMode.Overwrite: delete the existing data, then write
    //SaveMode.Append: append the new data to the existing output
    //df2.write.mode(SaveMode.Append).json("out/boy")
    df2.write.mode(SaveMode.Append).format("json").save("out2/boy")
    spark.stop()
  }
}
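By default the JSON source expects one JSON object per line. If the input were instead a single JSON array or pretty-printed objects spanning several lines, the multiLine option handles it. A minimal sketch, assuming such a hypothetical file layout for data/user.json:

import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromMultiLineJSON {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // multiLine lets the reader parse a JSON array or objects that span
    // multiple lines, instead of requiring one object per line
    val df: DataFrame = spark.read
      .option("multiLine", "true")
      .json("data/user.json")
    df.printSchema()
    spark.stop()
  }
}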
4. Parquet
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromParquet {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Parquet files carry their own schema, so no inference pass is needed
    val df: DataFrame = spark.read.parquet("out/par")
    //df.printSchema()
    df.select("age").show()
    spark.stop()
  }
}
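Parquet is also Spark's default data source, so the same directory can be loaded without naming the format. A minimal sketch, assuming spark.sql.sources.default has not been changed; the object name is illustrative.

import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromParquetDefault {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // With the default spark.sql.sources.default (= parquet), format() can be omitted
    val df: DataFrame = spark.read.load("out/par")
    // Equivalent explicit form:
    // val df: DataFrame = spark.read.format("parquet").load("out/par")
    df.printSchema()
    spark.stop()
  }
}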
5. ORC
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromOrc {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // ORC files also embed their own schema
    val df: DataFrame = spark.read.orc("out/orc")
    df.printSchema()
    df.select("age").show()
    spark.stop()
  }
}
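As with the other sources, the generic format/load API works for ORC as well. A minimal sketch of the equivalent read; the object name is illustrative.

import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromOrcFormat {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Same read as above, via the generic format/load API
    val df: DataFrame = spark.read.format("orc").load("out/orc")
    df.select("age").show()
    spark.stop()
  }
}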