Spark Streaming Real-Time Data Warehouse: First-Order Analysis (Part 2)

1. Prepare the Other Real-Time Dimension Tables

1.1 Create the other required dimension tables in Phoenix

gmall_spu_info

create table gmall_spu_info (id varchar primary key, spu_name varchar) SALT_BUCKETS = 2;

gmall_base_trademark

create table gmall_base_trademark (id varchar primary key, tm_name varchar) SALT_BUCKETS = 2;
 

gmall_base_category3

create table gmall_base_category3 (id varchar primary key, name varchar, category2_id varchar) SALT_BUCKETS = 2;
 

gmall_sku_info

create table gmall_sku_info (id varchar primary key, spu_id varchar, price varchar, sku_name varchar, tm_id varchar, category3_id varchar, create_time varchar, category3_name varchar, spu_name varchar, tm_name varchar) SALT_BUCKETS = 2;

1.2 Bootstrap the data of these four dimension tables with Maxwell

# Bootstrap the SPU table
bin/maxwell-bootstrap --user maxwell  --password aaaaaa --host hadoop162  --database gmall --table spu_info --client_id maxwell_1

# Bootstrap the trademark (brand) table
bin/maxwell-bootstrap --user maxwell  --password aaaaaa --host hadoop162  --database gmall --table base_trademark --client_id maxwell_1

# Bootstrap the level-3 category table
bin/maxwell-bootstrap --user maxwell  --password aaaaaa --host hadoop162  --database gmall --table base_category3 --client_id maxwell_1

# Bootstrap the SKU table
bin/maxwell-bootstrap --user maxwell  --password aaaaaa --host hadoop162  --database gmall --table sku_info --client_id maxwell_1

1.3 Run the BaseDBMaxwellApp class

Run this class to write the bootstrapped data into the ODS-layer topics.
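For context, BaseDBMaxwellApp was built in an earlier part of this series: it consumes the Maxwell change stream from Kafka and fans each record out into a per-table ODS topic. Below is a minimal sketch of that dispatch logic, assuming the MyKafkaUtil_1 and OffsetManager helpers and the run(...) parameters that appear later in this post; the exact record filtering in the real class may differ.

import org.apache.kafka.clients.producer.ProducerRecord
import org.json4s._
import org.json4s.jackson.JsonMethods

// Inside BaseDBMaxwellApp.run: sourceStream carries the raw Maxwell JSON records
sourceStream.foreachRDD(rdd => {
  rdd.foreachPartition(it => {
    val producer = MyKafkaUtil_1.getProducer
    it.foreach(json => {
      implicit val f = DefaultFormats
      // Maxwell wraps each change as {"database":..., "table":..., "type":..., "data":{...}}
      val record = JsonMethods.parse(json)
      val table = (record \ "table").extract[String]
      val opType = (record \ "type").extract[String]
      val data = record \ "data"
      // keep inserts and the bootstrap-insert records produced by maxwell-bootstrap,
      // routing each row to its own ODS topic
      if (opType == "insert" || opType == "bootstrap-insert") {
        producer.send(new ProducerRecord[String, String](
          "ods_" + table, JsonMethods.compact(JsonMethods.render(data))))
      }
    })
    producer.close()
  })
  OffsetManager.saveOffsets(offsetRanges, groupId, topic)
})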

1.4 Write the data to Phoenix

Write the data of the four new topics into Phoenix by running saveToPhoenix.

Approach

Here the SKU table needs to be joined with the trademark table (for the brand name), the level-3 category table (for the level-3 category name), and the SPU table (for the SPU name).

Code:

// write each of the four ODS topics into its Phoenix dimension table
saveToPhoenix[SpuInfo](rdd,
  "ods_spu_info",
  "gmall_spu_info",
  Seq("ID", "SPU_NAME"))

saveToPhoenix[BaseCategory3](rdd,
  "ods_base_category3",
  "gmall_base_category3",
  Seq("ID", "NAME", "CATEGORY2_ID"))

saveToPhoenix[BaseTrademark](rdd,
  "ods_base_trademark",
  "gmall_base_trademark",
  Seq("ID", "TM_NAME"))

saveToPhoenix[SkuInfo](rdd,
  "ods_sku_info",
  "gmall_sku_info",
  Seq("ID", "SPU_ID", "PRICE", "SKU_NAME", "TM_ID", "CATEGORY3_ID", "CREATE_TIME", "CATEGORY3_NAME", "SPU_NAME", "TM_NAME"))

// re-read the four dimension tables from Phoenix and join the brand, SPU and
// level-3 category names into gmall_sku_info
import org.apache.phoenix.spark._
val url = "jdbc:phoenix:hadoop162,hadoop163,hadoop164:2181"
spark.read.jdbc(url, "gmall_sku_info", new Properties()).createOrReplaceTempView("sku")
spark.read.jdbc(url, "gmall_spu_info", new Properties()).createOrReplaceTempView("spu")
spark.read.jdbc(url, "gmall_base_category3", new Properties()).createOrReplaceTempView("category3")
spark.read.jdbc(url, "gmall_base_trademark", new Properties()).createOrReplaceTempView("tm")
spark.sql(
  """
    |select
    |    sku.id as id,
    |    sku.spu_id spu_id,
    |    sku.price price,
    |    sku.sku_name sku_name,
    |    sku.tm_id  tm_id,
    |    sku.category3_id  category3_id,
    |    sku.create_time  create_time,
    |    category3.name  category3_name,
    |    spu.spu_name  spu_name,
    |    tm.tm_name  tm_name
    |from sku
    |join spu on sku.spu_id=spu.id
    |join category3 on sku.category3_id=category3.id
    |join tm on sku.tm_id=tm.id
    |""".stripMargin)
  .as[SkuInfo]
  .rdd
  // write the enriched rows back to gmall_sku_info
  .saveToPhoenix(
    "gmall_sku_info",
    Seq("ID", "SPU_ID", "PRICE", "SKU_NAME", "TM_ID", "CATEGORY3_ID", "CREATE_TIME", "CATEGORY3_NAME", "SPU_NAME", "TM_NAME"),
    zkUrl = Option("hadoop162,hadoop163,hadoop164:2181"))
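The saveToPhoenix[T] calls above go through a small generic helper (built in an earlier part) rather than the phoenix-spark method directly. Its definition is not shown in this post; the following is a rough sketch under the assumption that rdd pairs every record with the ODS topic it was read from, so the element type and signature here are assumptions.

import org.apache.phoenix.spark._
import org.apache.spark.rdd.RDD
import org.json4s.jackson.JsonMethods

import scala.reflect.ClassTag

// Hypothetical shape of the helper: filter the multiplexed stream down to one ODS topic,
// parse each JSON record into the case class T, and delegate to phoenix-spark.
def saveToPhoenix[T <: Product : Manifest : ClassTag](rdd: RDD[(String, String)],
                                                      odsTopic: String,
                                                      phoenixTable: String,
                                                      cols: Seq[String]): Unit = {
  implicit val f = org.json4s.DefaultFormats
  rdd
    .filter(_._1 == odsTopic)                                      // keep this dimension's records
    .map { case (_, json) => JsonMethods.parse(json).extract[T] }  // JSON -> case class
    .saveToPhoenix(phoenixTable,
      cols,
      zkUrl = Some("hadoop162,hadoop163,hadoop164:2181"))
}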


2. Create the Order Detail Table and Write It to the DWD Layer

2.1 Create case classes for the four new tables

package com.atguigu.realtime.bean
case class BaseTrademark(tm_id: String, tm_name: String)

case class BaseCategory3(id: String,
                         name: String,
                         category2_id: String)
case class SpuInfo(id: String, spu_name: String)

case class SkuInfo(id: String,
                   spu_id: String,
                   price: String,
                   sku_name: String,
                   tm_id: String,
                   category3_id: String,
                   create_time: String,

                   var category3_name: String = null,
                   var spu_name: String = null,
                   var tm_name: String = null)

2.2 Create the OrderDetail case class

package com.atguigu.realtime.bean

case class OrderDetail(id: Long,
                       order_id: Long,
                       sku_id: Long,
                       order_price: Double,
                       sku_num: Long,
                       sku_name: String,
                       create_time: String,

                       var spu_id: Long = 0L, // dimension fields, filled in later by the join with sku_info
                       var tm_id: Long = 0L,
                       var category3_id: Long = 0L,
                       var spu_name: String = null,
                       var tm_name: String = null,
                       var category3_name: String = null) {
  def mergeSkuInfo(skuInfo:SkuInfo) = {
    this.spu_id = skuInfo.spu_id.toLong
    this.tm_id = skuInfo.tm_id.toLong
    this.category3_id = skuInfo.category3_id.toLong
    this.spu_name = skuInfo.spu_name
    this.tm_name = skuInfo.tm_name
    this.category3_name = skuInfo.category3_name
    this
  }
}

2.3 Create the DwdOrderDetail class: write the order detail data to the DWD layer

Approach

1. Read the data from ods_order_detail, parse it, and join it with sku_info inside a transform:

   // first define a SparkSession built from the StreamingContext's configuration
   val spark: SparkSession = SparkSession.builder()
     .config(ssc.sparkContext.getConf)
     .getOrCreate()
   import spark.implicits._

   // read sku_info from Phoenix with Spark SQL: spark.read.jdbc(...) returns a DataFrame
   spark.read.jdbc(...)
     .as[SkuInfo]               // parse the DataFrame into the case class
     .rdd                       // the join needs key-value pairs, so map to (id, sku)
     .map(sku => (sku.id, sku))

   // key the detail RDD by sku_id, join it with the sku RDD, then merge the two sides
   rdd
     .map(detail => (detail.sku_id.toString, detail))
     .join(skuRDD)
     .map {
       case (skuId, (detail, sku)) =>
         detail.mergeSkuInfo(sku)   // the two tables are now connected
     }

2. Write the joined details to Kafka:

   .foreachRDD(rdd => {
     rdd.foreachPartition(it => {
       // get a producer first
       val producer = MyKafkaUtil_1.getProducer
       // the iterator holds many records, so loop over it
       it.foreach(detail => {
         implicit val f = org.json4s.DefaultFormats
         producer.send(new ProducerRecord[String, String]("dwd_order_detail", Serialization.write(detail)))
       })
       producer.close()
     })
     OffsetManager.saveOffsets(offsetRanges, groupId, topic)
   })

Code

Extend BaseApp and implement its run method. Define a SparkSession, parse each incoming record into an OrderDetail, and join it with the sku_info data read from Phoenix (an inner join on sku_id), merging the SKU dimension fields into the detail. Then write the enriched details to Kafka: iterate over each partition, get a producer from the Kafka utility class, and send every record to the dwd_order_detail topic.
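DwdOrderDetail below overrides master, appName, groupId, topic and bachTime and implements run, all of which are declared in the BaseApp base class from the earlier parts. BaseApp itself is not repeated in this post; inferred from those overrides, it is assumed to look roughly like the following sketch. The broker list is an assumption, and the real class presumably restores saved offsets through OffsetManager and builds the stream through MyKafkaUtil_1 rather than calling KafkaUtils directly.

package com.atguigu.realtime

import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.ListBuffer

abstract class BaseApp {
  val master: String
  val appName: String
  val groupId: String
  val topic: String
  val bachTime: Int // batch interval in seconds

  // per-batch business logic, implemented by each subclass
  def run(ssc: StreamingContext,
          offsetRanges: ListBuffer[OffsetRange],
          sourceStream: DStream[String]): Unit

  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setMaster(master).setAppName(appName), Seconds(bachTime))

    // assumed broker list; in the project this would come from the Kafka utility class
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop162:9092,hadoop163:9092,hadoop164:9092",
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean))

    val offsetRanges: ListBuffer[OffsetRange] = ListBuffer.empty[OffsetRange]

    val sourceStream: DStream[String] = KafkaUtils
      .createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](Seq(topic), kafkaParams))
      .transform(rdd => {
        // remember this batch's offsets so run(...) can save them after the data is written
        offsetRanges.clear()
        offsetRanges ++= rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        rdd
      })
      .map(_.value())

    run(ssc, offsetRanges, sourceStream)
    ssc.start()
    ssc.awaitTermination()
  }
}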

package com.atguigu.realtime.dwd

import java.util.Properties

import com.atguigu.realtime.BaseApp
import com.atguigu.realtime.bean.{OrderDetail, SkuInfo}
import com.atguigu.realtime.util.{MyKafkaUtil_1, OffsetManager}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.OffsetRange
import org.json4s.jackson.{JsonMethods, Serialization}

import scala.collection.mutable.ListBuffer

object DwdOrderDetail extends BaseApp {
  override val master: String = "local[2]"
  override val appName: String = "DwdOrderDetail"
  override val groupId: String = "DwdOrderDetail"
  override val topic: String = "ods_order_detail"
  override val bachTime: Int = 3

  override def run(ssc: StreamingContext,
                   offsetRanges: ListBuffer[OffsetRange],
                   sourceStream: DStream[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .config(ssc.sparkContext.getConf)
      .getOrCreate()
    import spark.implicits._

    sourceStream
      .map(str => {

        // toLong / toDouble are the custom serializers defined in Part 1 (a sketch is
        // given after this code); they handle numeric fields that arrive as JSON strings
        implicit val format = org.json4s.DefaultFormats + toLong + toDouble
        JsonMethods.parse(str).extract[OrderDetail]
      })

      .transform(rdd => {
        // read sku_info from Phoenix and key it by sku id
        val skuRDD: RDD[(String, SkuInfo)] = spark
          .read
          .jdbc("jdbc:phoenix:hadoop162,hadoop163,hadoop164:2181", "gmall_sku_info", new Properties())
          .as[SkuInfo]
          .rdd
          .map(sku => (sku.id, sku))

        // join the order details with the sku dimension data
        rdd
          .map(detail => (detail.sku_id.toString, detail))
          .join(skuRDD)
          .map {
            case (skuId, (detail, sku)) =>
              detail.mergeSkuInfo(sku)
          }
      })
      // write the enriched details to Kafka
      .foreachRDD(rdd => {
        rdd.foreachPartition(it => {
          val producer: KafkaProducer[String, String] = MyKafkaUtil_1.getProducer
          it.foreach(detail => {
            implicit val f = org.json4s.DefaultFormats
            producer.send(new ProducerRecord[String,String]("dwd_order_detail",Serialization.write(detail)))
          })

          producer.close()

        })
        OffsetManager.saveOffsets(offsetRanges, groupId, topic)
      })
  }
}
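One last piece: the format used when parsing OrderDetail adds two custom serializers, toLong and toDouble, that were defined back in the first part of the first-order analysis. They let json4s extract Long and Double fields even when a numeric column arrives as a JSON string. They are assumed to look roughly like this; the enclosing object name is hypothetical, and in the project the two values only need to be in scope in DwdOrderDetail, for example via an import.

import org.json4s.{CustomSerializer, JDouble, JInt, JString}

object MyJsonFormats {
  // Long fields: accept a JSON integer or a numeric string
  val toLong: CustomSerializer[Long] = new CustomSerializer[Long](_ => ( {
    case JInt(n)    => n.toLong
    case JString(s) => s.toLong
  }, {
    case n: Long => JInt(n)
  }))

  // Double fields: accept a JSON double, a JSON integer, or a numeric string
  val toDouble: CustomSerializer[Double] = new CustomSerializer[Double](_ => ( {
    case JDouble(d) => d
    case JInt(n)    => n.toDouble
    case JString(s) => s.toDouble
  }, {
    case d: Double => JDouble(d)
  }))
}

With these two serializers in scope, the line implicit val format = org.json4s.DefaultFormats + toLong + toDouble in run gives json4s everything it needs to extract OrderDetail from the Maxwell JSON.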
