在《Spark高级数据分析》一书中看到一种对数据进行初步统计分析的好方法,实践之后发现效果确实不错。在此记录下来并分享给大家,同时勉励自己不断学习新知识。
// NOTE(review): in Scala 2 these definitions must live inside an enclosing
// object (or be pasted into spark-shell); in Scala 3 they may stay top-level.

import scala.util.control.NonFatal

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.util.StatCounter

/** Returns true when `line` is a header row, i.e. it contains the column name `id_1`. */
def isHeader(line: String): Boolean = line.contains("id_1")

/**
 * Parses `s` as a Double.
 *
 * @return the parsed value, or `Double.NaN` when `s` cannot be parsed
 */
def toDouble(s: String): Double =
  try s.toDouble
  catch { case NonFatal(_) => Double.NaN }

/**
 * One parsed input record.
 *
 * @param id1     value of column 0
 * @param id2     value of column 1
 * @param scores  values of columns 2 to 10; unparsable fields are NaN
 * @param matched value of column 11
 */
case class MatchData(id1: Int, id2: Int, scores: Array[Double], matched: Boolean)

/**
 * Parses one comma-separated data line into a [[MatchData]].
 *
 * Throws if the line has fewer than 12 fields or if the id / matched columns
 * are not a valid Int / Boolean.
 */
def parseData(line: String): MatchData = {
  val fields = line.split(",")
  val id1 = fields(0).toInt
  val id2 = fields(1).toInt
  val matched = fields(11).toBoolean
  // Columns are 0-based; slice's end index is exclusive, so this takes columns 2..10.
  val scores = fields.slice(2, 11).map(toDouble)
  MatchData(id1, id2, scores, matched)
}

/**
 * Running statistics for one column, with NaN values counted separately
 * instead of being fed into the [[StatCounter]].
 */
class VariableStats extends Serializable {
  val stats: StatCounter = new StatCounter()
  // Number of NaN (missing) values seen so far.
  var m_lMissing: Long = 0L

  /** Records `x`: NaN increments the missing count, anything else updates `stats`. */
  def add(x: Double): VariableStats = {
    if (x.isNaN) m_lMissing += 1
    else stats.merge(x)
    this
  }

  /** Folds `other` into this instance (in place) and returns this instance. */
  def merge(other: VariableStats): VariableStats = {
    stats.merge(other.stats)
    m_lMissing += other.m_lMissing
    this
  }

  override def toString: String = "stats:" + stats.toString() + "NaN" + m_lMissing
}

object VariableStats extends Serializable {
  /** Creates a counter that already contains `x` (previously `x` was silently dropped). */
  def apply(x: Double): VariableStats = new VariableStats().add(x)
}

/**
 * Computes per-column statistics over an RDD of equally long score arrays.
 *
 * Each partition builds one array of [[VariableStats]] (one entry per column);
 * the per-partition arrays are then merged element-wise.
 *
 * NOTE(review): `reduce` is expected to fail on an RDD with no records at all —
 * confirm against the Spark version in use.
 */
def statsWithMissing(rdd: RDD[Array[Double]]): Array[VariableStats] = {
  val perPartition = rdd.mapPartitions { (iter: Iterator[Array[Double]]) =>
    val result: Iterator[Array[VariableStats]] =
      if (iter.hasNext) {
        // Seed the counters with the first record, then add the remaining ones.
        val nas: Array[VariableStats] = iter.next().map(d => VariableStats(d))
        iter.foreach { arr =>
          nas.zip(arr).foreach { case (n, d) => n.add(d) }
        }
        Iterator(nas)
      } else {
        // Empty partitions contribute nothing instead of throwing on next().
        Iterator.empty
      }
    result
  }
  perPartition.reduce { (n1, n2) =>
    n1.zip(n2).map { case (a, b) => a.merge(b) }
  }
}

/** Maps NaN to 0.0 and leaves every other value unchanged. */
def naz(d: Double): Double = if (d.isNaN) 0.0 else d

/** A record together with its computed score. */
case class Scored(md: MatchData, score: Double)

/**
 * Scores each record as the sum of score columns 2, 5, 6, 7 and 8 (NaN counted
 * as 0), keeps records scoring at least 4, and prints how many of those are
 * matched vs. not matched.
 */
def getScores(mds: RDD[MatchData]): Unit = {
  val scored = mds.map { md =>
    val score = Array(2, 5, 6, 7, 8).map(i => naz(md.scores(i))).sum
    Scored(md, score)
  }
  scored
    .filter(s => s.score >= 4)
    .map(s => s.md.matched)
    .countByValue()
    .foreach(println)
}

/**
 * Loads every file under `linkage/`, drops header lines, parses the records and
 * prints, per score column, the total number of missing values and the
 * difference between the matched and unmatched means.
 */
def DataAnalysis(): Unit = {
  val conf = new SparkConf().setAppName("dataAnalysis").setMaster("local[*]")
  val sc = new SparkContext(conf)
  try {
    // Read all files in the linkage directory.
    val filePath = "linkage/*"
    val textContext = sc.textFile(filePath)
    // Drop header lines.
    val noHeader = textContext.filter(x => !isHeader(x))
    val mds = noHeader.map(x => parseData(x))

    val statsm = statsWithMissing(mds.filter(_.matched).map(_.scores))
    val statsn = statsWithMissing(mds.filter(!_.matched).map(_.scores))
    statsm.zip(statsn).map { case (m, n) =>
      (m.m_lMissing + n.m_lMissing, m.stats.mean - n.stats.mean)
    }.foreach(println)
  } finally {
    // Always release the context, even if a job fails.
    sc.stop()
  }
}

def main(args: Array[String]): Unit = {
  DataAnalysis()
  println("OK")
}