在广告计算行业,经常要用到版位价格预测,我正好做了一个。模型的基本思路是:选用随机森林模型,在 Spark 中训练好之后存储到 Redis 中,再供其他地方调用。CSDN 没有 Scala 的语法高亮,只能选用 Python 的高亮了。下面来看看主程序的代码:
package com.iclick.spark.buzzads.stats
/**
* @author wilson.zhou
*/
import org.apache.log4j.{ Level, Logger }
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.sql.SQLContext
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.rdd.RDD
import com.buzzinate.common.util.ip.IPUtils
import com.iclick.spark.buzzads.udf.ParseWinnerLog
import com.iclick.spark.buzzads.udf.GetNumArrayStatInfo
import org.apache.commons.io.FileUtils
import java.io.File
import com.iclick.spark.buzzads.model.JSlotInfo
import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
import com.alibaba.fastjson.JSON
import java.util.Calendar
import java.text.SimpleDateFormat
import java.util.Date
import com.iclick.spark.common.util.Loggable
import com.iclick.spark.common.util.PathUtil
import com.iclick.spark.common.util.RedisClient
import com.iclick.spark.common.util.Config
import com.iclick.spark.common.util.ReadHdfsFileFunctions._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.stat.Statistics
import com.alibaba.fastjson.JSON
import scala.collection.mutable.ArrayBuffer
import com.iclick.spark.buzzads.model.TagidPriceModel
import com.iclick.spark.buzzads.util.Base64
import java.io.ObjectOutputStream
import java.io.FileOutputStream
import java.io.ObjectInputStream
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.util.MLUtils
/**
 * Spark batch job that trains a random-forest regressor on RTB winner logs and
 * publishes the trained model, together with the category dictionaries needed to
 * encode a request, to Redis.
 *
 * Pipeline (all steps are in [[PricePredition.main]]):
 *  1. read winner logs and extract (price, tagid, os, browser, province, hour);
 *  2. drop (tagid, province, hour) groups that have 10 records or fewer;
 *  3. drop records whose price is above mean + 3 * standard deviation;
 *  4. one-hot encode the five categorical fields into a sparse vector;
 *  5. train on 80% of the data, report regression metrics on both splits;
 *  6. Java-serialize the model to a local file, Base64-encode it, wrap it in a
 *     `TagidPriceModel`, and store the JSON form in Redis.
 */
object PricePredition {

  import scala.util.control.NonFatal

  /** One parsed winner-log record: (price, tagid, os, browser, province, hour). */
  private type WinnerRecord = (Double, String, String, String, String, String)

  /** Local file the serialized model is written to when no path is given on the command line. */
  private val DefaultModelPath = "/home/wilson/model"

  /**
   * Assigns a dense, zero-based index to every distinct (trimmed) value.
   *
   * The index order is whatever order `collect()` returns the distinct values in.
   */
  private def indexCategories(values: RDD[String]): Map[String, Int] =
    values.map(_.trim).distinct().collect().zipWithIndex.toMap

  /** Converts a Scala index map into the boxed Java map handed to `TagidPriceModel`. */
  private def toJavaIndex(index: Map[String, Int]): java.util.Map[String, java.lang.Integer] =
    index.map { case (key, idx) => key -> Int.box(idx) }.asJava

  /**
   * Entry point.
   *
   * @param args `days date maxDepth [modelPath]`:
   *             - `days`      number of days of logs, passed to `PathUtil.getPaths`;
   *             - `date`      date string, passed to `PathUtil.getPaths`;
   *             - `maxDepth`  maximum depth of each tree in the forest;
   *             - `modelPath` optional local file for the serialized model
   *                           (defaults to `/home/wilson/model`).
   */
  def main(args: Array[String]): Unit = {
    println("hello world")
    // Silence noisy framework logging.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Three arguments are mandatory: the original check allowed two and then
    // crashed with ArrayIndexOutOfBoundsException on args(2).
    if (args.length < 3) {
      System.err.println("Usage: days date maxDepth [modelPath]")
      System.exit(1)
    }
    val days = Integer.parseInt(args(0))
    val date = args(1)
    val maxDepth_int = args(2).toInt
    val path = if (args.length > 3) args(3) else DefaultModelPath

    val paths = PathUtil.getPaths("/rawdata/logcompress/rtbWinner/", days, "ds=", date)
    val conf = new SparkConf()
    if (System.getProperty("local") != null) {
      conf.setMaster("local").setAppName("tagid price prediction")
    }

    val start = System.currentTimeMillis()
    val sdf = new java.text.SimpleDateFormat("yyyy-MM-dd:HH:mm:ss")
    val sc = new SparkContext(conf)

    // Make sure the context is stopped even when a step below fails.
    try {
      val jsonRdd = sc.bytesJson(paths)

      // Extract price, slot (tagid), os, browser, province and hour from each log line.
      // Lines that fail to parse are printed and mapped to an all-placeholder record,
      // which the filter below removes together with any record missing a field.
      val kvRdd_temp = jsonRdd.map { json =>
        try {
          val winner = ParseWinnerLog.parse(json)
          val price = winner.getOrElse("price", 0.0).toString.toDouble
          val tagid = winner.getOrElse("bid", "_test").toString.trim.split("_")(0) + winner.getOrElse("tagid", "?").toString
          val os = winner.getOrElse("os", "?").toString
          val browser = winner.getOrElse("browser", "?").toString
          val province = winner.getOrElse("province", "?").toString
          val hour = winner.getOrElse("hour", "?").toString
          (price, tagid, os, browser, province, hour)
        } catch {
          case NonFatal(_) =>
            println(json)
            (0.0, "?", "?", "?", "?", "?")
        }
      }.filter(x => x._1 != 0.0 && x._2 != "?" && x._3 != "?" && x._4 != "?" && x._5 != "?" && x._6 != "?")
      println("winner data has getted")

      // Keep only (tagid, province, hour) groups with more than 10 records.
      // Cached because it is scanned once for the statistics and again for everything
      // derived from it; recomputing would redo the parse and the shuffle.
      val kvRdd_filter_outline_befor = kvRdd_temp
        .groupBy((r: WinnerRecord) => (r._2, r._5, r._6))
        .filter { case (_, group) => group.size > 10 }
        .flatMap { case (_, group) => group }
        .cache()

      // Outlier removal: drop prices above mean + 3 * (population) standard deviation.
      // Only the upper tail is cut. stats() yields count, mean and stdev in one pass.
      val priceStats = kvRdd_filter_outline_befor.map(_._1).stats()
      if (priceStats.count == 0) {
        throw new IllegalStateException(
          "no winner records left after parsing and low-traffic filtering; nothing to train on")
      }
      val mean = priceStats.mean
      val var_squrt = priceStats.stdev
      val kvRdd = kvRdd_filter_outline_befor.filter(x => x._1 <= mean + 3 * var_squrt)
      println("before filter count")
      println(priceStats.count)
      println("after filter count")
      println(kvRdd.count())
      println(kvRdd.getClass)

      // Dictionaries for the dummy-variable (one-hot) encoding, one per categorical field.
      val category_tagid = indexCategories(kvRdd.map(_._2))
      val category_browser = indexCategories(kvRdd.map(_._4))
      val category_os = indexCategories(kvRdd.map(_._3))
      val category_province = indexCategories(kvRdd.map(_._5))
      val category_hour = indexCategories(kvRdd.map(_._6))

      // Each field owns a contiguous index range in the feature vector, laid out in
      // the order tagid | browser | os | province | hour.
      val browserOffset = category_tagid.size
      val osOffset = browserOffset + category_browser.size
      val provinceOffset = osOffset + category_os.size
      val hourOffset = provinceOffset + category_province.size
      val dimension = hourOffset + category_hour.size

      // Encode every record as a sparse vector with exactly five 1.0 entries.
      // Values are trimmed before lookup because the dictionaries hold trimmed keys;
      // looking up the raw value silently dropped records with surrounding whitespace.
      // A record whose value is missing from a dictionary is skipped.
      val data = kvRdd.flatMap { case (price, tagid, os, browser, province, hour) =>
        val encoded = for {
          t <- category_tagid.get(tagid.trim)
          b <- category_browser.get(browser.trim)
          o <- category_os.get(os.trim)
          p <- category_province.get(province.trim)
          h <- category_hour.get(hour.trim)
        } yield LabeledPoint(
          price,
          Vectors.sparse(
            dimension,
            Array(t, browserOffset + b, osOffset + o, provinceOffset + p, hourOffset + h),
            Array.fill(5)(1.0)))
        encoded.iterator
      }.filter(x => x.label > 0.0)
      println(sdf.format(new Date(System.currentTimeMillis())))

      // 80% of the data for training, 20% for testing.
      val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L)
      val (trainingData, testData) = (splits(0), splits(1))

      // An empty map means no feature is declared categorical, so every feature is
      // treated as continuous; the one-hot columns are 0/1, hence two bins suffice.
      val categoricalFeaturesInfo = Map[Int, Int]()
      val impurity = "variance" // variance is the node impurity measure
      val maxDepth = maxDepth_int // maximum tree depth, from the command line
      val maxBins = 2

      // Random-forest regression. (A single DecisionTree.trainRegressor model with the
      // same parameters was tried earlier and has been removed from this job.)
      val numTrees = 5 // Use more in practice.
      val featureSubsetStrategy = "auto" // Let the algorithm choose.
      println("Randon forest model will strat .......")
      val model_rand = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo,
        numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

      // Metrics on the training split.
      println("random model indicate will caculated")
      val valuesAndPreds = trainingData.map { point =>
        (model_rand.predict(point.features), point.label)
      }
      val metrics = new RegressionMetrics(valuesAndPreds)
      println(s"COUNT=${trainingData.count()} \n MSE = ${metrics.meanSquaredError} \n RMSE=${metrics.rootMeanSquaredError} \n R-squared = ${metrics.r2} \n MAE = ${metrics.meanAbsoluteError} \n Explained variance = ${metrics.explainedVariance}")

      // Metrics on the held-out test split.
      println("random model testdata indicate will caculated")
      val labelsAndPredictions_random = testData.map { point =>
        (model_rand.predict(point.features), point.label)
      }
      val metrics1 = new RegressionMetrics(labelsAndPredictions_random)
      println("test data indicate will caculat")
      println(s"COUNT=${testData.count()}\nMSE = ${metrics1.meanSquaredError}\nRMSE=${metrics1.rootMeanSquaredError}\nR-squared = ${metrics1.r2}\nMAE = ${metrics1.meanAbsoluteError}\nExplained variance = ${metrics1.explainedVariance}")

      // Persist the model with Java serialization to a local file on the driver.
      val out = new ObjectOutputStream(new FileOutputStream(path))
      try out.writeObject(model_rand) finally out.close()

      // Assemble the payload for Redis: dimension, dictionaries, per-field offsets, model.
      val tagidPriceModel = new TagidPriceModel()
      tagidPriceModel.setDimension(dimension)
      tagidPriceModel.setTagidMap(toJavaIndex(category_tagid))
      tagidPriceModel.setBrowserMap(toJavaIndex(category_browser))
      tagidPriceModel.setProvinceMap(toJavaIndex(category_province))
      tagidPriceModel.setOsMap(toJavaIndex(category_os))
      tagidPriceModel.setHourMap(toJavaIndex(category_hour))
      val offsetMap = new java.util.HashMap[String, java.lang.Integer]
      offsetMap.put("tagid", Int.box(0))
      offsetMap.put("browser", Int.box(browserOffset))
      offsetMap.put("os", Int.box(osOffset))
      offsetMap.put("province", Int.box(provinceOffset))
      offsetMap.put("hour", Int.box(hourOffset))
      tagidPriceModel.setOffsetMap(offsetMap)

      // Base64 form of the model file just written.
      val randomModelString = Base64.base64Str(path)
      tagidPriceModel.setModelBase64(randomModelString)

      // Sanity check: the Base64 string must deserialize back into a RandomForestModel.
      val in = new ObjectInputStream(Base64.strBytes(randomModelString))
      try {
        in.readObject() match {
          case _: RandomForestModel => // round trip succeeded
          case other =>
            throw new IllegalStateException(
              "serialized model did not deserialize to a RandomForestModel: " + other)
        }
      } finally in.close()

      val json = JSON.toJSON(tagidPriceModel).toString()

      // Best-effort dump of the JSON payload next to the model file; a failure here is
      // reported but does not abort the job.
      try {
        val writer = new java.io.PrintWriter(new java.io.File(path + ".txt"))
        try writer.println(json) finally writer.close()
      } catch {
        case NonFatal(e) => println("there has some errors " + e)
      }

      // Publish to Redis under a per-day key; 30 * 24 * 3600 is passed as the second
      // argument of RedisClient.set (presumably a TTL in seconds; confirm against RedisClient).
      try {
        val sdf_redis = new SimpleDateFormat("yy-MM-dd")
        RedisClient.init(Config.getString("slotprice.redis.hosts"))
        RedisClient.set("ad.slot.info.v3." + sdf_redis.format(new Date), 30 * 24 * 3600, json)
      } catch {
        case NonFatal(e) => println(e)
      }

      println("the start time is:" + sdf.format(new Date(start)))
      println("the end time is :" + sdf.format(new Date(System.currentTimeMillis())))
    } finally {
      sc.stop()
    }
  }
}