SparkSQL Data Sources
package doc.df
import java.util.Properties
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.mapred.{FileOutputFormat, JobConf}
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.spark.sql.SparkSession
/**
 * @Program: doc.df
 * @Author: huangwei
 * @Date: 2019/9/17 10:00
 * @description: Data sources
 */
object DataSource {

  case class Perple(name: String, age: Long)

  // Custom output format: writes each key's records to a file named after the key
  // instead of the default part-xxxxx files.
  class myOutput extends MultipleTextOutputFormat[Any, Any] {

    // Override generateFileNameForKeyValue, which is responsible for generating
    // the name of each output file.
    override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
      val fileName = key.asInstanceOf[String] + ".txt"
      fileName
    }

    // Drop the key from the written record; only the value is emitted.
    override def generateActualKey(key: Any, value: Any): Any = {
      null
    }

    // Only qualify and re-set the output path; no existence check is performed,
    // so writing to an existing directory does not fail.
    override def checkOutputSpecs(ignored: FileSystem, job: JobConf): Unit = {
      var outDir: Path = FileOutputFormat.getOutputPath(job)
      if (outDir != null) {
        val fs: FileSystem = ignored
        outDir = fs.makeQualified(outDir)
        FileOutputFormat.setOutputPath(job, outDir)
      }
    }
  }
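
  // Usage sketch (an assumption, not part of the original listing): a pair RDD keyed by
  // the desired file name can be written through myOutput via saveAsHadoopFile, so each
  // key's records land in "<key>.txt" rather than the default part-xxxxx files.
  // The output path and sample data below are hypothetical.
  private def saveByKey(spark: SparkSession): Unit = {
    val rdd = spark.sparkContext.parallelize(Seq(("Alyssa", "3,9,15,20"), ("Ben", "red")))
    rdd.saveAsHadoopFile(
      "src/main/resources/byKey", // hypothetical output directory
      classOf[String],            // key class
      classOf[String],            // value class
      classOf[myOutput]           // the custom format defined above
    )
  }
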
  // Several basic data sources
  private def baseDataSource(spark: SparkSession): Unit = {
    // 1. Read a Parquet file. Parquet is a columnar storage format; when writing Parquet,
    //    all columns are automatically converted to nullable for backward compatibility.
    val userDF = spark.read.load("src/main/resources/users.parquet")
    userDF.show()
    // +------+--------------+----------------+
    // |  name|favorite_color|favorite_numbers|
    // +------+--------------+----------------+
    // |Alyssa|          null|  [3, 9, 15, 20]|
    // |   Ben|           red|              []|
    // +------+--------------+----------------+
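    // Note (a sketch, not in the original): load() uses spark.sql.sources.default, which
    // defaults to "parquet", so the call above is equivalent to the explicit form:
    // val userDF2 = spark.read.format("parquet").load("src/main/resources/users.parquet")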
    // userDF.select("name", "favorite_color").write.save("src/main/resources/namesAndFavColors.parquet")
    // val nameAndColorDF = spark.read.load("src/main/resources/namesAndFavColors.parquet") // Spark writes output as a directory; to produce a file with a specific name, a custom output format is needed (see myOutput above)
    // nameAndColorDF.show()
    // +------+--------------+
    // | name|favo