package com.userportrait
import org.apache.spark.mllib.clustering.BisectingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.feature.Normalizer
/**
 * Clusters users into age segments with bisecting k-means over their usage features.
 */
object UserAge {
case class Person(imei: String, model: String, k: Int)
// case class Center(k: Int, center: Array[Double])
def main(args: Array[String]) {
val sparkConf = new SparkConf().setAppName("BisectingKMeans.age")
val sc = new SparkContext(sparkConf)
// def parse(line: String): Vector = Vectors.dense(line.split("\001").map(_.toDouble))
// val data = sc.textFile("/user/hive/warehouse/tb_lava_user_dimensions")
val hiveContext = new HiveContext(sc)
import hiveContext.implicits._
import hiveContext.sql
val data = sql("select ex.imei,ex.model,ex.rn1,ex.rn2,ex.rn3,ex.rn4 from xxx ex")
val data2 = data.map { line =>
// line.mkString("\001")
line.toSeq.map(_.toString).toArray
}
// keep (imei, model, feature vector); the first two columns are identifiers, the rest are features
val parsedData = data2.map { line =>
(line(0), line(1), Vectors.dense(line.drop(2).map(_.toDouble)))
}.cache()
// val parsedData = data.map { line =>
//   val parts = line.split('\001')
//   (parts(0), Vectors.dense(parts.drop(1).map(_.toDouble)))
// }.cache()
// L2-normalize the feature vectors before clustering
val trainData = new Normalizer().transform(parsedData.map(_._3))
val model = new BisectingKMeans().setK(4).run(trainData)
// println(s"Compute Cost: ${model.computeCost(trainData)}")// model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>// println(s"Cluster Center ${idx}: ${center}")// }// val cen = model.clusterCenters.zipWithIndex.map{// case (center, idx) =>// (idx,center.toArray)// }.map(x=>Center(x._1,x._2))// val hdfs = org.apache.hadoop.fs.FileSystem.get(sc.hadoopConfiguration)// val path = new Path("/user/oozie/data/user_dimensions_age")// if(hdfs.exists(path)){// hdfs.delete(path,true)// }RDD[((Int, String), String)]// model.predict(trainData).zip(parsedData.map(_._1)).map(line =>line._2+"\001"+line._1).saveAsTextFile("/user/oozie/data/user_dimensions_age")
// assign each user to a cluster and persist (imei, model, cluster) to Hive
// intermediate type after the two zips: RDD[((Int, String), String)] = ((cluster, imei), model)
model.predict(trainData)
  .zip(parsedData.map(_._1))
  .zip(parsedData.map(_._2))
  .map(x => Person(x._1._2, x._2, x._1._1))
  .toDF()
  .write.mode(SaveMode.Overwrite)
  .saveAsTable("tb_lava_user_dimensions_age")
sc.stop()
}
}
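// A minimal sketch, not part of the original job, of how the commented-out diagnostics above
// could become a small K-selection pass. It assumes the same normalized feature RDD as UserAge
// (e.g. call UserAgeKSelection.inspect(trainData) right after the Normalizer step); the object
// and method names are hypothetical.
object UserAgeKSelection {
  import org.apache.spark.mllib.clustering.BisectingKMeans
  import org.apache.spark.mllib.linalg.Vector
  import org.apache.spark.rdd.RDD

  /** Prints the within-cluster cost and the centers for a range of candidate K values. */
  def inspect(trainData: RDD[Vector], ks: Seq[Int] = 2 to 6): Unit = {
    ks.foreach { k =>
      val m = new BisectingKMeans().setK(k).run(trainData)
      // computeCost is the sum of squared distances from each point to its closest center
      println(s"k=$k cost=${m.computeCost(trainData)}")
      m.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
        println(s"  center $idx: $center")
      }
    }
  }
}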
package com.userportrait
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{SaveMode, Row}
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
/**
 * Trains a multinomial Naive Bayes classifier on TF-IDF text features and assigns
 * each user a business category.
 */
object UserBusiness {
case class RawDataRecord(category: String, text: String)
case class Person(imei:String,model:String,k:Int)
def main(args: Array[String]) {
// val Array(date) = args
val sparkConf = new SparkConf().setAppName("NaiveBayes.business")
val sc = new SparkContext(sparkConf)
val hiveContext = new HiveContext(sc)
import hiveContext.implicits._
// val dataDF = hiveContext.sql("select bid category, word words from tb_lava_user_category3 ").toDF()
val srcRDD = sc.textFile("/user/hive/warehouse/tb_business").map {
x =>
val data = x.split(",")
RawDataRecord(data(0),data(1))
}
// 70% of the data for training, 30% for testing
// val splits = srcRDD.randomSplit(Array(0.7, 0.3))
val trainingDF = srcRDD.toDF()
// var testDF = splits(1).toDF()
// val dataDF = hiveContext.sql("select category, text from tb_lava_user_text_category1").toDF()
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val wordsData = tokenizer.transform(trainingDF)
// wordsData.select($"category",$"text",$"words").take(1)
val hashingTF = new HashingTF().setNumFeatures(500000).setInputCol("words").setOutputCol("rawFeatures")
val featurizedData = hashingTF.transform(wordsData)
// featurizedData.select($"category", $"words", $"rawFeatures").take(1)
val idfModel = new IDF().setInputCol("rawFeatures").setOutputCol("features").fit(featurizedData)
val rescaledData = idfModel.transform(featurizedData)
// rescaledData.select($"category", $"rawFeatures", $"features").take(1)
val trainDataRdd = rescaledData.select($"category",$"features").map {
case Row(label: String, features: Vector) =>
LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
}
// trainDataRdd.getNumPartitions
// trainDataRdd.repartition(7)
val model = NaiveBayes.train(trainDataRdd, lambda = 1.0, modelType = "multinomial")
model.save(sc, "/user/spark/model/business/NaiveBayesModel")
val sameModel = NaiveBayesModel.load(sc,"/user/spark/model/business/NaiveBayesModel")
val testDF = hiveContext.sql("select tb.imei,tb.model,collect_list(enword) words from tb_lava_user_word tb group by tb.imei,tb.model").toDF()
// reuse the same hashing scheme (500k features, "words" -> "rawFeatures") for the scoring data
val testFeaturizedData = hashingTF.transform(testDF)
// note: this refits IDF on the scoring set instead of reusing the training idfModel
val testIdf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val testIdfModel = testIdf.fit(testFeaturizedData)
val testRescaledData = testIdfModel.transform(testFeaturizedData)
// convert to the Naive Bayes scoring format
val testDataRdd = testRescaledData.select($"imei",$"model",$"features").map {
case Row(imei: String,model: String, features: Vector) =>
(imei,model,Vectors.dense(features.toArray))
}
testDataRdd.map(p => Person(p._1, p._2, sameModel.predict(p._3).toInt)).toDF().write.mode(SaveMode.Overwrite).saveAsTable("tb_lava_user_dimensions_business")
sc.stop()
}
}
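// A minimal sketch, not in the original file, of the held-out evaluation hinted at by the
// commented-out randomSplit above: train on 70% of the labeled records, score the remaining 30%,
// and report accuracy. It assumes an RDD[LabeledPoint] built the same way as trainDataRdd; the
// object and method names are hypothetical.
object UserBusinessEvaluation {
  import org.apache.spark.mllib.classification.NaiveBayes
  import org.apache.spark.mllib.regression.LabeledPoint
  import org.apache.spark.rdd.RDD

  /** Returns the accuracy of a multinomial Naive Bayes model on a 70/30 random split. */
  def evaluate(labeled: RDD[LabeledPoint]): Double = {
    val Array(train, test) = labeled.randomSplit(Array(0.7, 0.3), seed = 42L)
    val model = NaiveBayes.train(train, lambda = 1.0, modelType = "multinomial")
    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    // fraction of held-out records whose predicted category matches the label
    predictionAndLabel.filter { case (pred, label) => pred == label }.count().toDouble / test.count()
  }
}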
package com.userportrait
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Writes the merged per-user labels from Hive (tb_lava_user_group_merge) into the HBase "userlabel" table.
 */
object UserLabelHbase {
def main(args: Array[String]) {
val Array(date) = args
// val date="2016-12-08"
val tableName="userlabel"
val sparkConf = new SparkConf().setAppName("UserLabelHbase")
val sc = new SparkContext(sparkConf)
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", "192.168.35.94,192.168.35.95,192.168.35.96")
// conf.set("hbase.rootdir", "/hbase")
// conf.set("mapred.child.java.opts","-XX:+UseParallelGC -XX:ParallelGCThreads=4 -XX:GCTimeRatio=10 -XX:YoungGenerationSizeIncrement=20 -XX:TenuredGenerationSizeIncrement=20 -XX:AdaptiveSizeDecrementScaleFactor=2 -Xmx2000m");
// val hbaseContext = new HBaseContext(sc, conf);
// val scan = new Scan()
// val scanRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan)
// hbaseContext.bulkDelete[Array[Byte]](scanRdd, TableName.valueOf(tableName),putRecord => new Delete(putRecord),4);
val admin = new HBaseAdmin(conf)
admin.disableTable(tableName)
admin.truncateTable(TableName.valueOf(tableName),false)
// admin.enableTable(tableName)
val jobConf = new JobConf(conf,this.getClass)
jobConf.setOutputFormat(classOf[TableOutputFormat])
jobConf.set(TableOutputFormat.OUTPUT_TABLE,tableName)
jobConf.set("mapreduce.output.fileoutputformat.outputdir", "/user/spark/userlabel")
val hiveContext = new HiveContext(sc)
val sql = s"select reverse(imei) imei,model,mnc, label from tb_lava_user_group_merge where date='$date' "
// println(sql)
val putsRdd = hiveContext.sql(sql).toDF().repartition(2).map {
case Row(imei: String,model:String,mnc:String,label:String) =>
val p = new Put(Bytes.toBytes(imei))
p.addColumn(Bytes.toBytes("info"),Bytes.toBytes("model"),Bytes.toBytes(model))
p.addColumn(Bytes.toBytes("info"),Bytes.toBytes("mnc"),Bytes.toBytes(mnc))
p.addColumn(Bytes.toBytes("info"),Bytes.toBytes(label),Bytes.toBytes("1"))
(new ImmutableBytesWritable, p)
}
putsRdd.saveAsHadoopDataset(jobConf)
sc.stop()
}
}
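// A minimal sketch, not part of the original job, showing how the rows written above could be
// read back for a spot check via the standard TableInputFormat / newAPIHadoopRDD pattern.
// The object name and the columns printed are assumptions; the ZooKeeper quorum and table name
// are copied from UserLabelHbase.
object UserLabelHbaseCheck {
  import org.apache.hadoop.hbase.HBaseConfiguration
  import org.apache.hadoop.hbase.client.Result
  import org.apache.hadoop.hbase.io.ImmutableBytesWritable
  import org.apache.hadoop.hbase.mapreduce.TableInputFormat
  import org.apache.hadoop.hbase.util.Bytes
  import org.apache.spark.{SparkConf, SparkContext}

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("UserLabelHbaseCheck"))
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "192.168.35.94,192.168.35.95,192.168.35.96")
    conf.set(TableInputFormat.INPUT_TABLE, "userlabel")
    val rows = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result])
    // map to plain strings before collecting, since the HBase Writable types are not serializable
    val sample = rows.map { case (key, result) =>
      val model = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("model")))
      (Bytes.toString(key.get()), model)
    }.take(5)
    sample.foreach { case (rowkey, model) => println(s"$rowkey -> $model") }
    sc.stop()
  }
}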