import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
import scala.io.Source
object DataFrameCreate {

  /**
   * Reads the first rows of a local CSV file, builds a Spark DataFrame with a
   * String id column followed by Double feature columns, and prints every row.
   *
   * CSV layout assumed (not validated here): column 0 is a String event id,
   * every remaining column parses with `toDouble` — TODO confirm against the
   * actual train.csv.
   *
   * @param args unused command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Spark Pi").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Only load a small sample of the file.
    val maxRows = 10

    val file = Source.fromFile("D:\\buffer\\data\\train.csv")
    val inputData: ArrayBuffer[Row] = new ArrayBuffer[Row]()
    try {
      // `take(maxRows)` replaces the original manual start/stop counter;
      // try/finally guarantees the file is closed even if toDouble throws.
      for (line <- file.getLines().take(maxRows)) {
        val fields = line.trim.split(",")
        // First field stays a String (event id); the rest become Doubles.
        val values: Seq[Any] = fields.head.trim +: fields.tail.toSeq.map(_.trim.toDouble)
        inputData += Row(values: _*)
      }
    } finally {
      file.close()
    }

    val schemaStringStr = "EventId"
    val schemaStringDou = """
DER_ma11_MMC
,DER_ma11_tran1ver1e_met_lep
,DER_ma11_vi1
,DER_pt_h
,DER_deltaeta_jet_jet
,DER_ma11_jet_jet
,DER_prodeta_jet_jet
,DER_deltar_tau_lep
,DER_pt_tot
,DER_1um_pt
,DER_pt_ratio_lep_tau
,DER_met_phi_centrality
,DER_lep_eta_centrality
,PRI_tau_pt
,PRI_tau_eta
,PRI_tau_phi
,PRI_lep_pt
,PRI_lep_eta
,PRI_lep_phi
,PRI_met
,PRI_met_phi
,PRI_met_1umet
,PRI_jet_num
,PRI_jet_leading_pt
,PRI_jet_leading_eta
,PRI_jet_leading_phi
,PRI_jet_1u0leading_pt
,PRI_jet_1u0leading_eta
,PRI_jet_1u0leading_phi
,PRI_jet_all_pt
,Weight
,Label
"""

    // Schema: one nullable String column, then one nullable Double column per
    // name in schemaStringDou.
    // BUG FIX: the original wrote `x => x.trim; StructField(x, ...)`, which
    // discarded the trimmed value and put raw newline/space-padded names into
    // the schema. Trim must be applied to the name used by StructField.
    val doubleFields =
      schemaStringDou.split(",").map(name => StructField(name.trim, DoubleType, true))
    val schemaType = StructType(
      StructField(schemaStringStr, StringType, true) +: doubleFields.toList
    )

    val inputRDD = sc.parallelize(inputData)
    val df = sqlContext.createDataFrame(inputRDD, schemaType)

    // Collect to the driver and print each row (sample is tiny, so safe).
    df.collect().foreach(println)
  }
}