DataFrame:通过SparkSql将Json数据转为DataFrame

import java.text.DecimalFormat
import com.alibaba.fastjson.JSON
import com.donews.data.AppConfig
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory
/**
  * Created by silentwolf on 2016/6/3.
  */

object UserTagTable{

  val LOG = LoggerFactory.getLogger(UserOverviewFirst.getClass)

  val REP_HOME = s"${AppConfig.HDFS_MASTER}/${AppConfig.HDFS_REP}"

  def main(args: Array[String]) {

    var startTime = System.currentTimeMillis()

    val conf: com.typesafe.config.Config = ConfigFactory.load()

    val sc = new SparkContext()

    val sqlContext = new SQLContext(sc)

    var df1: DataFrame = null

    if (args.length == 0) {
      println("请输入: appkey , StartTime : 2016-04-10  ,StartEnd :2016-04-11")
    }
    else {

      var appkey = args(0)

      var lastdate = args(1)

      df1 = loadDataFrame(sqlContext, appkey, "2016-04-10", lastdate)

      df1.registerTempTable("suuidTable")

      sqlContext.udf.register("taginfo", (a: String) => userTagInfo(a))
      sqlContext.udf.register("intToString", (b: Long) => intToString(b))


      //***重点***:将临时表中的suuid和自定函数中Json数据,转为Json数据。通过 sqlContext.read.json读取转为DataFrame。
      sqlContext.read.json(sqlContext.sql(" select distinct(suuid) AS suuid,taginfo(suuid) from  suuidTable group by suuid").map(x => {

        val suuid:String = x.get(0).toString
        val taginfo = x.get(1).toString
        val temp = JSON.parseObject(taginfo)
        "{" +"\"SUUID\"" + ":" + suuid + "," +"\"MAN\"" + ":" + temp.get("man") + "," + "\"WOMAN\"" + ":" + temp.get("woman") + "," + "\"AGE10_19\"" + ":" + temp.get("age10_19") + "," +
          "\"AGE20_29\"" + ":" + temp.get("age20_29") + "," + "\"AGE30_39\"" + ":" + temp.get("age30_39") + "," + "\"AGE40_49\"" + ":" + temp.get("age40_49") + "," +
          "\"AGE50_59\"" + ":" + temp.get("age50_59") + "," + "\"GAME\"" + ":" + temp.get("game") + "," + "\"MOVIE\"" + ":" + temp.get("movie") + "," +
          "\"MUSIC\"" + ":" + temp.get("music") + "," + "\"ART\"" + ":" + temp.get("art") + "," + "\"POLITICS_NEWS\"" + ":" + temp.get("politics_news") + "," +
          "\"FINANCIAL\"" + ":" + temp.get("financial") + "," + "\"EDUCATION_TRAINING\"" + ":" + temp.get("education_training") + "," + "\"HEALTH_CARE\"" + ":" + temp.get("health_care") + "," +
          "\"TRAVEL\"" + ":" + temp.get("travel") + "," + "\"AUTOMOBILE\"" + ":" + temp.get("automobile") + "," + "\"HOUSE_PROPERTY\"" + ":" + temp.get("house_property") + "," +
          "\"CLOTHING_ACCESSORIES\"" + ":" + temp.get("clothing_accessories") + "," + "\"BEAUTY\"" + ":" + temp.get("beauty") + "," + "\"IT\"" + ":" + temp.get("IT") + "," +
          "\"BABY_PRODUCT\"" + ":" + temp.get("baby_Product") + "," + "\"FOOD_SERVICE\"" + ":" + temp.get("food_service") + "," + "\"HOME_FURNISHING\"" + ":" + temp.get("home_furnishing") + "," +
          "\"SPORTS\"" + ":" + temp.get("sports") + "," + "\"OUTDOOR_ACTIVITIES\"" + ":" + temp.get("outdoor_activities") + "," + "\"MEDICINE\"" + ":" + temp.get("medicine") +
          "}";
      })).registerTempTable("resultTable")

      val resultDF = sqlContext.sql(s"select '$appkey' AS APPKEY, '$lastdate' AS DATE,intToString(SUUID) AS SUUID,MAN,WOMAN,AGE10_19,AGE20_29,AGE30_39 ," +
        "AGE40_49 ,AGE50_59,GAME,MOVIE,MUSIC,ART,POLITICS_NEWS,FINANCIAL,EDUCATION_TRAINING,HEALTH_CARE,TRAVEL,AUTOMOBILE," +
        "HOUSE_PROPERTY,CLOTHING_ACCESSORIES,BEAUTY,IT,BABY_PRODUCT ,FOOD_SERVICE ,HOME_FURNISHING ,SPORTS ,OUTDOOR_ACTIVITIES ," +
        "MEDICINE from resultTable WHERE SUUID IS NOT NULL")
      resultDF.write.mode(SaveMode.Overwrite).options(
        Map("table" -> "USER_TAGS", "zkUrl" -> conf.getString("Hbase.url"))
      ).format("org.apache.phoenix.spark").save()

    }
  }

  def intToString(suuid: Long): String = {
    suuid.toString()
  }

  def userTagInfo(num1: String): String = {

    var de = new DecimalFormat("0.00")
    var mannum = de.format(math.random).toFloat
    var man = mannum
    var woman = de.format(1 - mannum).toFloat

    var age10_19num = de.format(math.random * 0.2).toFloat
    var age20_29num = de.format(math.random * 0.2).toFloat
    var age30_39num = de.format(math.random * 0.2).toFloat
    var age40_49num = de.format(math.random * 0.2).toFloat

    var age10_19 = age10_19num
    var age20_29 = age20_29num
    var age30_39 = age30_39num
    var age40_49 = age40_49num
    var age50_59 = de.format(1 - age10_19num - age20_29num - age30_39num - age40_49num).toFloat

    var game = de.format(math.random * 1).toFloat
    var movie = de.format(math.random * 1).toFloat
    var music = de.format(math.random * 1).toFloat
    var art = de.format(math.random * 1).toFloat
    var politics_news = de.format(math.random * 1).toFloat

    var financial = de.format(math.random * 1).toFloat
    var education_training = de.format(math.random * 1).toFloat
    var health_care = de.format(math.random * 1).toFloat
    var travel = de.format(math.random * 1).toFloat
    var automobile = de.format(math.random * 1).toFloat

    var house_property = de.format(math.random * 1).toFloat
    var clothing_accessories = de.format(math.random * 1).toFloat
    var beauty = de.format(math.random * 1).toFloat
    var IT = de.format(math.random * 1).toFloat
    var baby_Product = de.format(math.random * 1).toFloat

    var food_service = de.format(math.random * 1).toFloat
    var home_furnishing = de.format(math.random * 1).toFloat
    var sports = de.format(math.random * 1).toFloat
    var outdoor_activities = de.format(math.random * 1).toFloat
    var medicine = de.format(math.random * 1).toFloat

    "{" +"\"man\"" + ":" + man + "," + "\"woman\"" + ":" + woman + "," + "\"age10_19\"" + ":" + age10_19 + "," + "\"age20_29\"" + ":" + age20_29 + "," +
      "\"age30_39\"" + ":" + age30_39 + "," + "\"age40_49\"" + ":" + age40_49 + "," + "\"age50_59\"" + ":" + age50_59 + "," + "\"game\"" + ":" + game + "," +
      "\"movie\"" + ":" + movie + "," + "\"music\"" + ":" + music + "," + "\"art\"" + ":" + art + "," + "\"politics_news\"" + ":" + politics_news + "," +
      "\"financial\"" + ":" + financial + "," + "\"education_training\"" + ":" + education_training + "," + "\"health_care\"" + ":" + health_care + "," +
      "\"travel\"" + ":" + travel + "," + "\"automobile\"" + ":" + automobile + "," + "\"house_property\"" + ":" + house_property + "," + "\"clothing_accessories\"" + ":" + clothing_accessories + "," +
      "\"beauty\"" + ":" + beauty + "," + "\"IT\"" + ":" + IT + "," + "\"baby_Product\"" + ":" + baby_Product + "," + "\"food_service\"" + ":" + food_service + "," +
      "\"home_furnishing\"" + ":" + home_furnishing + "," + "\"sports\"" + ":" + sports + "," + "\"outdoor_activities\"" + ":" + outdoor_activities + "," + "\"medicine\"" + ":" + medicine +
      "}";

  }

  def loadDataFrame(ctx: SQLContext, appkey: String, startDay: String, endDay: String): DataFrame = {
    val path = s"$REP_HOME/appstatistic"
    ctx.read.parquet(path)
      .filter(s"timestamp is not null and appkey='$appkey'  and day>='$startDay' and day<='$endDay'")
  }



}




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

北京小辉

你的鼓舞将是我最大的动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值