Without further ado, here is the code:
package test
import java.io.File
import java.util.UUID
import com.aa.cphe.helper.FileHelper
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
/**
* Created by Administrator on 2018/11/2.
*/
object ReadExcelTest {

  /**
   * Linear-interpolated i-th quartile (i = 1, 2, 3) of a sorted array,
   * using the position formula pos = i * (n + 1) / 4 with 1-based positions.
   *
   * Fixes two bugs in the original:
   *  - the integer-position branch indexed `arr(pos)` with a 1-based position
   *    into a 0-based array (off-by-one);
   *  - the fractional branch had its interpolation weights swapped
   *    (the lower element was weighted by `pos - lo` instead of `hi - pos`).
   *
   * @param sorted ascending-sorted values; caller guarantees length >= 5
   * @param i      which quartile (1, 2 or 3)
   */
  private def quartileAt(sorted: Array[Double], i: Int): Double = {
    val pos = i * (sorted.length + 1).toDouble / 4
    val lo = Math.floor(pos).toInt
    val hi = Math.ceil(pos).toInt
    if (lo == hi) {
      // Exact 1-based position: convert to 0-based index.
      sorted(lo - 1)
    } else {
      // Weight each neighbour by its distance to the fractional position.
      sorted(lo - 1) * (hi - pos) + sorted(hi - 1) * (pos - lo)
    }
  }

  def main(args: Array[String]) {
    // Drop any stale embedded Hive metastore left over from a previous local run.
    val file: File = new File("metastore_db")
    if (file.exists()) FileHelper.deleteDir(file)

    val appName = "ReadExcelTest"
    val sparkConf = new SparkConf().setMaster("local")
    val spark = SparkSession.builder()
      .config(sparkConf)
      .appName(appName)
      .enableHiveSupport()
      .getOrCreate()

    /**
     * Merge all columns of a row into one comma-separated string.
     * mkString is safe for an empty row; the original
     * foldLeft(...).substring(1) threw StringIndexOutOfBoundsException there.
     */
    spark.udf.register("mergeCols", (row: Row) => {
      row.toSeq.mkString(",")
    })

    /**
     * Quartile-based outlier bounds for a comma-separated list of numbers.
     * Returns "lower,upper" where lower = Q1 - 1.5*IQR and upper = Q3 + 1.5*IQR,
     * or "" when the input is null/empty or has fewer than 5 values.
     */
    spark.udf.register("getQuartile", (data: String) => {
      if (data == null || data == "") {
        ""
      } else {
        val arr: Array[Double] = data.split(",").map(_.toDouble).sorted
        if (arr.length > 4) {
          val q1 = quartileAt(arr, 1)
          val q3 = quartileAt(arr, 3)
          val iqr = q3 - q1
          (q1 - 1.5 * iqr) + "," + (q3 + 1.5 * iqr)
        } else {
          ""
        }
      }
    })

    /**
     * Outlier flag: "1" when the value lies outside [min, max], "0" when inside,
     * "" when any argument is missing.
     */
    spark.udf.register("execCompare", (_c9: String, min: String, max: String) => {
      if (_c9 == null || _c9 == "" || min == null || min == "" || max == null || max == "") {
        ""
      } else {
        // Parse once instead of calling toDouble repeatedly.
        val v = _c9.toDouble
        if (v < min.toDouble || v > max.toDouble) "1" else "0"
      }
    })

    /** Surrogate-key generator for output rows. */
    spark.udf.register("getUU", () => {
      UUID.randomUUID().toString
    })

    val path = "hdfs://master:8020/tmp/niuTest/12.csv"
    println("数据读取测试,文件为:" + path)
    readXlsx(path, spark)
  }

  /**
   * Read the CSV at `path`, compute per-user (_c1) quartile outlier bounds over
   * column _c9, flag each row as outlier/normal in a new _c10 column, and
   * append the result as a single CSV part file on HDFS.
   *
   * @param path  input CSV path (read header-less, columns named _c0, _c1, ...)
   * @param spark active session; expects the UDFs registered in main()
   * @return the flagged DataFrame that was written out
   */
  def readXlsx(path: String, spark: SparkSession): DataFrame = {
    val frame: DataFrame = spark.read.format("com.databricks.spark.csv").option("header", "false").load(path)
    // The file is loaded header-less, so the header text arrives as a data row;
    // drop every row whose second column matches the header's second column.
    val header: Row = frame.first()
    val rows = frame.filter(line => line(1) != header(1)).toDF().cache()
    rows.show(false)

    // Collect each user's _c9 values into one list, then join them into a
    // comma-separated string for the getQuartile UDF.
    val df = rows.groupBy("_c1").agg(
      collect_list("_c9").alias("CONTEN")
    ).cache()
    println("用户数:" + df.count())
    val dff = df.selectExpr("_c1", "concat_ws(',',CONTEN ) as RE")
    val df1 = dff.selectExpr("_c1", "getQuartile(RE) as QU")

    // Attach the per-user bounds back onto every row and flag outliers.
    val df3 = rows.join(df1, Seq("_c1"), "left")
      .selectExpr("getUU() as id","_c1","_c2","_c3","_c4","_c5","_c6","_c7","_c8","_c9","split(QU,',')[0] as MIN","split(QU,',')[1] as MAX")
      .selectExpr("id","_c1","_c2","_c3","_c4","_c5","_c6","_c7","_c8","_c9","execCompare(_c9,MIN,MAX) as _c10")
    df3.show(false)

    val hdfs: String = "hdfs://master:8020/tmp/niuTest"
    // Write as one part file, appending, with a header row, comma-delimited.
    df3.repartition(1).write.mode(SaveMode.Append)
      .format("com.databricks.spark.csv")
      .options(Map("header" -> "true", "delimiter" -> ","))
      .save(hdfs)
    println("结果条数:" + df3.count())
    df3
  }

  /**
   * Delete every .csv / .csv.crc file under `file`, recursing into
   * subdirectories. The original built the recursive list with
   * `files ++ ...` and discarded the result, so nested files were never deleted.
   *
   * @param file root directory (null tolerated as a no-op)
   */
  def deleteDir(file: java.io.File): Unit = {
    if (file != null) {
      getFile(file).foreach(_.delete())
    }
  }

  /**
   * Recursively collect all .csv / .csv.crc files under `file`.
   * The original computed the recursive concatenation and then returned only
   * the top-level matches; this version returns the full recursive list.
   *
   * @param file root directory
   * @return matching files at every depth (empty for a plain file / bad path)
   */
  def getFile(file: java.io.File): Array[java.io.File] = {
    // listFiles() returns null for non-directories and on I/O error.
    val entries = Option(file.listFiles()).getOrElse(Array.empty[java.io.File])
    val matched = entries.filter(!_.isDirectory)
      .filter(t => t.toString.endsWith(".csv") || t.toString.endsWith(".csv.crc"))
    matched ++ entries.filter(_.isDirectory).flatMap(getFile)
  }
}
Utility class:
package com.aa.cphe.helper
import java.io.File
/**
* Created by zhangjiajie on 2018/8/7.
*/
object FileHelper {
  /**
   * Recursively delete `dir` and everything below it.
   *
   * `File.listFiles()` returns null when the path is a plain file, does not
   * exist, or an I/O error occurs; the original dereferenced that null and
   * threw NullPointerException. This version treats a null listing as empty,
   * so calling deleteDir on a plain file simply deletes it.
   *
   * @param dir file or directory to remove
   */
  def deleteDir(dir: File): Unit = {
    val children = Option(dir.listFiles()).getOrElse(Array.empty[File])
    children.foreach { child =>
      if (child.isDirectory) deleteDir(child) else child.delete()
    }
    // Delete the (now empty) directory itself, or the plain file.
    dir.delete()
  }
}