基于spark的版位价格预测，广告计算

最新推荐文章于 2024-04-05 00:00:37 发布
旭旭_哥
最新推荐文章于 2024-04-05 00:00:37 发布
阅读量1.9k
点赞数
分类专栏： apache spark 机器学习数据挖掘
本文链接：https://blog.csdn.net/luoyexuge/article/details/51262674
版权
机器学习同时被 3 个专栏收录
114 篇文章 7 订阅
订阅专栏
数据挖掘
18 篇文章 0 订阅
订阅专栏
apache spark
10 篇文章 0 订阅
订阅专栏
在广告计算行业，经常需要做版位价格预测，这次正好做了一个。基本思路是：选用随机森林模型，在 Spark 中训练好之后存储到 Redis 中，供其他服务调用。下面是主程序（CSDN 的代码高亮没有 Scala 选项，只能选用 Python），具体请看代码：
package com.iclick.spark.buzzads.stats

/**
 * @author wilson.zhou
 */
import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.sql.SQLContext
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.rdd.RDD
import com.buzzinate.common.util.ip.IPUtils
import com.iclick.spark.buzzads.udf.ParseWinnerLog
import com.iclick.spark.buzzads.udf.GetNumArrayStatInfo
import org.apache.commons.io.FileUtils
import java.io.File
import com.iclick.spark.buzzads.model.JSlotInfo
import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
import com.alibaba.fastjson.JSON
import java.util.Calendar
import java.text.SimpleDateFormat
import java.util.Date
import com.iclick.spark.common.util.Loggable
import com.iclick.spark.common.util.PathUtil
import com.iclick.spark.common.util.RedisClient
import com.iclick.spark.common.util.Config
import com.iclick.spark.common.util.ReadHdfsFileFunctions._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.stat.Statistics
import com.alibaba.fastjson.JSON
import scala.collection.mutable.ArrayBuffer
import com.iclick.spark.buzzads.model.TagidPriceModel
import com.iclick.spark.buzzads.util.Base64
import java.io.ObjectOutputStream
import java.io.FileOutputStream
import java.io.ObjectInputStream
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.util.MLUtils


/**
 * Spark batch job that fits a random-forest regression model for ad-slot (tagid) winning prices.
 *
 * Pipeline, as implemented in `main`:
 *  1. read RTB winner logs for the requested days and extract
 *     (price, slot, os, browser, province, hour) per record;
 *  2. drop (slot, province, hour) groups with 10 records or fewer, then drop prices above
 *     mean + 3 standard deviations;
 *  3. encode the five categorical fields as a single sparse indicator vector;
 *  4. train a RandomForest regressor on 80% of the data and print metrics for both splits;
 *  5. Java-serialize the model to a local file, embed it (Base64) together with the
 *     category and offset maps in a TagidPriceModel, and write the resulting JSON both to
 *     `<modelPath>.txt` and to Redis.
 */
object PricePredition {

  /** Local file the serialized model is written to when no path argument is supplied. */
  private val DefaultModelPath = "/home/wilson/model"

  /**
   * Builds a value -> index lookup from the distinct, trimmed values of `values`.
   *
   * Indices follow the order in which `distinct().collect()` returns the values. The map
   * values are boxed explicitly because the map is later handed to Java setters.
   *
   * @param values raw categorical values, one per record
   * @return map from trimmed value to its zero-based index
   */
  private def categoryIndex(values: RDD[String]): Map[String, java.lang.Integer] =
    values.map(_.trim).distinct().collect().zipWithIndex
      .map { case (value, idx) => value -> Int.box(idx) }
      .toMap

  /**
   * Job entry point.
   *
   * @param args `days date maxDepth [modelPath]`:
   *             `days` and `date` select the input partitions (passed to `PathUtil.getPaths`),
   *             `maxDepth` is the maximum tree depth, and the optional `modelPath` overrides
   *             the local file used for the serialized model.
   */
  def main(args: Array[String]): Unit = {
    import scala.util.control.NonFatal

    println("hello world")
    // Silence framework logging that is not useful for this job.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // args(0), args(1) and args(2) are all read below, so all three are mandatory.
    if (args.length < 3) {
      System.err.println("Usage: days date maxDepth [modelPath]")
      System.exit(1)
    }
    val days = Integer.parseInt(args(0))
    val date = args(1)
    val maxDepth_int = args(2).toInt
    val path = if (args.length > 3) args(3) else DefaultModelPath

    val paths = PathUtil.getPaths("/rawdata/logcompress/rtbWinner/", days, "ds=", date)

    val conf = new SparkConf()
    if (System.getProperty("local") != null) {
      conf.setMaster("local").setAppName("tagid price prediction")
    }

    val start = System.currentTimeMillis()
    val sdf = new java.text.SimpleDateFormat("yyyy-MM-dd:HH:mm:ss")
    val sc = new SparkContext(conf)
    try {
      val jsonRdd = sc.bytesJson(paths)

      // Extract price, slot id, os, browser, province and hour from each winner record.
      // Records that fail to parse, or that miss any field, fall back to the sentinel
      // values (0.0 / "?") and are removed by the filter right after.
      val kvRdd_temp = jsonRdd.map { json =>
        try {
          val winner = ParseWinnerLog.parse(json)
          val price = winner.getOrElse("price", 0.0).toString.toDouble
          val tagid = winner.getOrElse("bid", "_test").toString.trim.split("_")(0) +
            winner.getOrElse("tagid", "?").toString
          val os = winner.getOrElse("os", "?").toString
          val browser = winner.getOrElse("browser", "?").toString
          val province = winner.getOrElse("province", "?").toString
          val hour = winner.getOrElse("hour", "?").toString
          (price, tagid, os, browser, province, hour)
        } catch {
          case NonFatal(e) =>
            println(json)
            (0.0, "?", "?", "?", "?", "?")
        }
      }.filter(x => x._1 != 0.0 && x._2 != "?" && x._3 != "?" && x._4 != "?" && x._5 != "?" && x._6 != "?")
      println("winner data has getted")

      // Keep only (slot, province, hour) groups with more than 10 records.
      // Cached because this RDD sits behind a shuffle and feeds every action below.
      val kvRdd_filter_outline_befor = kvRdd_temp
        .groupBy((f: (Double, String, String, String, String, String)) => (f._2, f._5, f._6))
        .map(_._2)
        .filter(_.size > 10)
        .flatMap(rows => rows)
        .cache()

      // Price outlier removal: compute mean and (population) standard deviation of the
      // price, then drop everything above mean + 3 standard deviations.
      val prices = kvRdd_filter_outline_befor.map(_._1)
      val count = prices.count()
      // reduce() on an empty RDD fails with an unhelpful message; fail early and clearly.
      require(count > 0, "no winner records left after low-traffic filtering; cannot train a model")
      val sum = prices.reduce(_ + _)
      val mean = sum * 1.0 / count

      val var_n = prices.map(x => Math.pow((x - mean), 2)).reduce(_ + _)
      val var_squrt = Math.sqrt(var_n / count)

      val kvRdd = kvRdd_filter_outline_befor.filter(x => x._1 <= mean + 3 * var_squrt)
      println("before filter count")
      println(count)
      println("after  filter count")
      println(kvRdd.count())

      println(kvRdd.getClass)

      // One index map per categorical field; keys are the trimmed field values.
      val category_tagid = categoryIndex(kvRdd.map(_._2))
      val category_browser = categoryIndex(kvRdd.map(_._4))
      val category_os = categoryIndex(kvRdd.map(_._3))
      val category_province = categoryIndex(kvRdd.map(_._5))
      val category_hour = categoryIndex(kvRdd.map(_._6))

      // Layout of the indicator vector: [tagid | browser | os | province | hour].
      val browserOffset = category_tagid.size
      val osOffset = browserOffset + category_browser.size
      val provinceOffset = osOffset + category_os.size
      val hourOffset = provinceOffset + category_province.size
      val numFeatures = hourOffset + category_hour.size

      // Encode each record as a sparse vector with exactly one active index per field.
      // Lookups trim the value so they match the (trimmed) keys of the index maps.
      // A record that still cannot be encoded becomes a 0.0-label dummy point, which the
      // trailing filter removes (together with any non-positive price).
      val data = kvRdd.map { point =>
        try {
          val indices = Array[Int](
            category_tagid(point._2.trim).intValue,
            browserOffset + category_browser(point._4.trim).intValue,
            osOffset + category_os(point._3.trim).intValue,
            provinceOffset + category_province(point._5.trim).intValue,
            hourOffset + category_hour(point._6.trim).intValue)
          LabeledPoint(point._1, Vectors.sparse(numFeatures, indices, Array.fill(indices.length)(1.0)))
        } catch {
          case NonFatal(e) =>
            LabeledPoint(0.0, Vectors.sparse(numFeatures, Array(1), Array(1.0)))
        }
      }.filter(x => x.label > 0.0)

      println(sdf.format(System.currentTimeMillis()))

      // 80% of the data for training, 20% for testing.
      val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L)
      val (trainingData, testData) = (splits(0), splits(1))

      // An empty map makes MLlib treat every feature as continuous; the features here are
      // 0/1 indicators, hence maxBins = 2. Tree depth comes from the command line.
      val categoricalFeaturesInfo = Map[Int, Int]()
      val impurity = "variance" // variance is the impurity measure for regression
      val maxDepth = maxDepth_int
      val maxBins = 2

      // NOTE: an earlier revision also evaluated a single DecisionTree regressor here;
      // only the random forest is trained now.
      val numTrees = 5 // Use more in practice.
      val featureSubsetStrategy = "auto" // Let the algorithm choose.

      println("Randon forest model will strat .......")
      val model_rand = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo,
        numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

      // RegressionMetrics expects (prediction, observation) pairs.
      println("random model indicate will caculated")
      val valuesAndPreds = trainingData.map { point =>
        (model_rand.predict(point.features), point.label)
      }
      val metrics = new RegressionMetrics(valuesAndPreds)
      println(s"COUNT=${trainingData.count} \n MSE = ${metrics.meanSquaredError} \n RMSE=${metrics.rootMeanSquaredError} \n R-squared = ${metrics.r2} \n MAE = ${metrics.meanAbsoluteError} \n Explained variance = ${metrics.explainedVariance}")

      println("random model testdata indicate will caculated")
      val labelsAndPredictions_random = testData.map { point =>
        (model_rand.predict(point.features), point.label)
      }
      val metrics1 = new RegressionMetrics(labelsAndPredictions_random)
      println("test data indicate will caculat")
      println(s"COUNT=${testData.count}\nMSE = ${metrics1.meanSquaredError}\nRMSE=${metrics1.rootMeanSquaredError}\nR-squared = ${metrics1.r2}\nMAE = ${metrics1.meanAbsoluteError}\nExplained variance = ${metrics1.explainedVariance}")

      // Persist the model with Java serialization; always release the file handle.
      val out = new ObjectOutputStream(new FileOutputStream(path))
      try {
        out.writeObject(model_rand)
      } finally {
        out.close()
      }

      // Assemble the payload for Redis: dimension, per-field index maps, per-field offsets
      // into the feature vector, and the Base64-encoded model file.
      val tagidPriceModel = new TagidPriceModel()
      tagidPriceModel.setDimension(numFeatures)
      tagidPriceModel.setTagidMap(category_tagid.asJava)
      tagidPriceModel.setBrowserMap(category_browser.asJava)
      tagidPriceModel.setProvinceMap(category_province.asJava)
      tagidPriceModel.setOsMap(category_os.asJava)
      tagidPriceModel.setHourMap(category_hour.asJava)

      val offsetMap = new java.util.HashMap[String, java.lang.Integer]
      offsetMap.put("tagid", 0)
      offsetMap.put("browser", browserOffset)
      offsetMap.put("os", osOffset)
      offsetMap.put("province", provinceOffset)
      offsetMap.put("hour", hourOffset)
      tagidPriceModel.setOffsetMap(offsetMap)

      val randomModelString = Base64.base64Str(path)
      tagidPriceModel.setModelBase64(randomModelString)

      // Round-trip check: the encoded string must deserialize back into a RandomForestModel.
      // The result is not used further; a failure here aborts before anything is published.
      val in = new ObjectInputStream(Base64.strBytes(randomModelString))
      val randomforestmodel_load =
        try {
          in.readObject().asInstanceOf[RandomForestModel]
        } finally {
          in.close()
        }

      val json = JSON.toJSON(tagidPriceModel).toString()

      // Keep a local copy of the JSON next to the model file.
      val writer = new java.io.PrintWriter(new java.io.File(path + ".txt"))
      try {
        writer.println(json)
      } catch {
        case NonFatal(e) =>
          println("there has some  errors  ")
          println(e)
      } finally {
        writer.close()
      }

      // Publish to Redis under a date-stamped key with a 30-day expiry (best effort).
      try {
        val sdf_redis = new SimpleDateFormat("yy-MM-dd")
        RedisClient.init(Config.getString("slotprice.redis.hosts"))
        RedisClient.set("ad.slot.info.v3." + sdf_redis.format(new Date), 30 * 24 * 3600, json)
      } catch {
        case NonFatal(e) => println(e)
      }
      println("the start time is:" + sdf.format(start))
      println("the end time is :" + sdf.format(System.currentTimeMillis()))
    } finally {
      // Release the SparkContext even when a step above failed.
      sc.stop()
    }
  }

}