Offline Data Processing Task 1: Data Extraction

This article walks through an offline data processing task: extracting incremental data from MySQL into Hive for the user_info, sku_info, base_province, base_region, order_info, and order_detail tables, and verifying the partitions with the Hive CLI.

Data Preparation

        Prepare the data files

Data download: https://download.csdn.net/download/dafsq/87635868?spm=1001.2014.3001.5501

        Create the tables in MySQL and import the data

        Once the import completes, check the tables in MySQL.
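As a quick sanity check (a minimal sketch, assuming the same host, credentials, and MySQL JDBC driver used by the Spark jobs later in this article), you can count the rows of each imported table over JDBC:

import java.util.Properties
import org.apache.spark.sql.SparkSession

object CheckImport {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("check-import").master("local[*]").getOrCreate()
    val pro = new Properties()
    pro.put("user", "root")
    pro.put("password", "p@ssw0rd")
    val url = "jdbc:mysql://master:3306/shtd_store"
    // Print a row count for each imported table
    Seq("user_info", "sku_info", "base_province", "base_region", "order_info", "order_detail")
      .foreach(t => println(s"$t: " + spark.read.jdbc(url, t, pro).count()))
    spark.stop()
  }
}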

Creating the Hive database and tables

        1. Create the ods database

create database ods;

        2. Create the user_info table, partitioned by etl_date

CREATE TABLE `user_info`  (
  `id` bigint,
  `login_name` string,
  `nick_name` string,
  `passwd` string,
  `name` string,
  `phone_num` string,
  `email` string,
  `head_img` string,
  `user_level` string,
  `birthday` timestamp,
  `gender` string,
  `create_time` timestamp,
  `operate_time` timestamp
)  PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;

        3. Create the sku_info table, also partitioned by etl_date

CREATE TABLE `sku_info`  (
  `id` bigint,
  `spu_id` bigint,
  `price` decimal(10, 0),
  `sku_name` string,
  `sku_desc`  string,
  `weight` decimal(10, 2),
  `tm_id` bigint,
  `category3_id` bigint,
  `sku_default_img` string,
  `create_time` timestamp
) PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;

        4. Create the base_province table, partitioned by etl_date

CREATE TABLE `base_province`  (
  `id` bigint,
  `name` string,
  `region_id` string,
  `area_code` string,
  `iso_code` string
)  PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;

        5. Create the base_region table, partitioned by etl_date

CREATE TABLE `base_region`  (
  `id` string,
  `region_name` string
) PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;

        6. Create the order_info table, partitioned by etl_date

CREATE TABLE `order_info`  (
  `id` bigint,
  `consignee` string,
  `consignee_tel` string,
  `final_total_amount` decimal(16, 2),
  `order_status` string,
  `user_id` bigint,
  `delivery_address` string,
  `order_comment` string,
  `out_trade_no` string,
  `trade_body` string,
  `create_time` timestamp,
  `operate_time` timestamp,
  `expire_time` timestamp,
  `tracking_no` string,
  `parent_order_id` bigint,
  `img_url` string,
  `province_id` int,
  `benefit_reduce_amount` decimal(16, 2),
  `original_total_amount` decimal(16, 2),
  `feight_fee` decimal(16, 2)
) PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;

        7. Create the order_detail table, partitioned by etl_date

CREATE TABLE `order_detail`  (
  `id` bigint,
  `order_id` bigint,
  `sku_id` bigint,
  `sku_name` string,
  `img_url` string,
  `order_price` decimal(10, 2),
  `sku_num` string,
  `create_time` timestamp,
  `source_type` string,
  `source_id` bigint
) PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;

Tasks

 Testing tip: run a full extraction of the MySQL data first, then insert a few rows with the newest timestamps into MySQL and run the incremental extraction (a sketch for generating such test rows is shown below). It also pays to modify the SQL yourself and rerun it to deepen your understanding.
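One way to create such test rows is to insert directly into MySQL with fresh timestamps. This is only a sketch: the id value 99999 is a hypothetical unused key, and it assumes user_info accepts rows with only these columns populated.

import java.sql.DriverManager

object InsertTestRow {
  def main(args: Array[String]): Unit = {
    // Insert one row whose timestamps are "now", so the next incremental
    // run should pick up exactly this row.
    val conn = DriverManager.getConnection(
      "jdbc:mysql://master:3306/shtd_store", "root", "p@ssw0rd")
    try {
      val stmt = conn.prepareStatement(
        "insert into user_info (id, login_name, nick_name, create_time, operate_time) " +
          "values (?, ?, ?, now(), now())")
      stmt.setLong(1, 99999L)            // hypothetical id, assumed unused
      stmt.setString(2, "test_user")
      stmt.setString(3, "incremental test")
      stmt.executeUpdate()
    } finally conn.close()
  }
}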

1. Extract the incremental data of user_info in the shtd_store database into the user_info table in Hive's ods database. Use operate_time or create_time in ods.user_info as the incremental field (that is, for each MySQL row take the larger of these two timestamps and compare it against the larger of the same two fields in ods), and load only the new rows, keeping field names and types unchanged. Add a static partition whose field is of type String and whose value is the day before the practice date, formatted yyyyMMdd. Then run show partitions ods.user_info in the Hive CLI.

  import java.util.Properties
  import java.time.LocalDate
  import java.time.format.DateTimeFormatter
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    // Run as the root HDFS user
    System.setProperty("HADOOP_USER_NAME", "root")
    System.setProperty("user.name", "root")
    // Create the SparkSession with Hive support
    val spark: SparkSession = SparkSession
      .builder()
      .appName("sparksql")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // MySQL connection properties
    val pro: Properties = new Properties()
    pro.put("user", "root")
    pro.put("password", "p@ssw0rd")
    // MySQL JDBC URL
    val my = "jdbc:mysql://master:3306/shtd_store"
    // Yesterday's date in yyyyMMdd format, used as the static partition value
    val nowTime: String = LocalDate.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMdd"))

    // Register the MySQL table as a temporary view
    spark.read.jdbc(my, "user_info", pro).createOrReplaceTempView("userInfo")

//    // Full extraction
//    spark.sql(
//      s"""insert overwrite table ods.user_info
//        |partition(etl_date = '$nowTime')
//        |select * from userInfo
//        |""".stripMargin)

    // Incremental extraction
    // High-water mark: the larger of create_time and operate_time already in ods
    // (greatest() in Spark SQL skips null values)
    val max = spark.sql("select greatest(create_time, operate_time) as max1 from ods.user_info order by max1 desc limit 1").collect()(0).get(0).toString

    // Load only the MySQL rows newer than the high-water mark
    spark.sql(
      s"""
        |insert into table ods.user_info
        |partition(etl_date = '$nowTime')
        |select * from userInfo
        |where
        |create_time > cast('$max' as timestamp) or
        |operate_time > cast('$max' as timestamp)
        |""".stripMargin)

    // Verify
    spark.sql("select * from ods.user_info").show()
    spark.sql("show partitions ods.user_info").show()

    spark.close()
  }

2. Extract the incremental data of sku_info in the shtd_store database into the sku_info table in Hive's ods database. Use create_time in ods.sku_info as the incremental field and load only the new rows, keeping field names and types unchanged. Add a static partition whose field is of type String and whose value is the day before the practice date, formatted yyyyMMdd. Then run show partitions ods.sku_info in the Hive CLI.

  // Imports: same as the user_info job above
  def main(args: Array[String]): Unit = {
    // Run as the root HDFS user
    System.setProperty("HADOOP_USER_NAME", "root")
    System.setProperty("user.name", "root")
    // Create the SparkSession with Hive support
    val spark: SparkSession = SparkSession
      .builder()
      .appName("sparksql")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // MySQL connection properties
    val pro: Properties = new Properties()
    pro.put("user", "root")
    pro.put("password", "p@ssw0rd")
    // MySQL JDBC URL
    val my = "jdbc:mysql://master:3306/shtd_store"
    // Yesterday's date in yyyyMMdd format
    val nowTime: String = LocalDate.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMdd"))

    // Register the MySQL table as a temporary view
    spark.read.jdbc(my, "sku_info", pro).createOrReplaceTempView("skuInfo")

//    // Full extraction
//    spark.sql(
//      s"""insert overwrite table ods.sku_info
//        |partition(etl_date = '$nowTime')
//        |select * from skuInfo
//        |""".stripMargin)

    // Incremental extraction
    // High-water mark: the latest create_time already in ods
    val max = spark.sql("select create_time as max from ods.sku_info order by max desc limit 1").collect()(0).get(0).toString

    // Load only the MySQL rows newer than the high-water mark
    spark.sql(
      s"""
        |insert into table ods.sku_info
        |partition(etl_date = '$nowTime')
        |select * from skuInfo
        |where
        |create_time > cast('$max' as timestamp)
        |""".stripMargin)

    // Verify
    spark.sql("select * from ods.sku_info").show()
    spark.sql("show partitions ods.sku_info").show()
    spark.close()
  }

3. Extract the incremental data of base_province in the shtd_store database into the base_province table in Hive's ods database. Use id in ods.base_province as the incremental field and load only the new rows, keeping field names and types unchanged, while adding a create_time field set to the current time. Add a static partition whose field is of type String and whose value is the day before the practice date, formatted yyyyMMdd. Then run show partitions ods.base_province in the Hive CLI.

  // Imports: same as the user_info job above
  def main(args: Array[String]): Unit = {
    // Run as the root HDFS user
    System.setProperty("HADOOP_USER_NAME", "root")
    System.setProperty("user.name", "root")
    // Create the SparkSession with Hive support
    val spark: SparkSession = SparkSession
      .builder()
      .appName("sparksql")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // MySQL connection properties
    val pro: Properties = new Properties()
    pro.put("user", "root")
    pro.put("password", "p@ssw0rd")
    // MySQL JDBC URL
    val my = "jdbc:mysql://master:3306/shtd_store"
    // Yesterday's date in yyyyMMdd format
    val nowTime: String = LocalDate.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMdd"))

    // If ods.base_province was created without create_time, add the column once:
    //spark.sql("alter table ods.base_province add columns(create_time timestamp)")

    // Register the MySQL table as a temporary view
    spark.read.jdbc(my, "base_province", pro).createOrReplaceTempView("baseProvince")

//    // Full extraction
//    spark.sql(
//      s"""insert overwrite table ods.base_province
//        |partition(etl_date = '$nowTime')
//        |select
//        |id,
//        |name,
//        |region_id,
//        |area_code,
//        |iso_code,
//        |cast(date_format(current_timestamp(), 'yyyy-MM-dd HH:mm:ss') as timestamp) create_time
//        |from baseProvince
//        |""".stripMargin)

    // Incremental extraction
    // High-water mark: the largest id already in ods
    val max = spark.sql("select id as max from ods.base_province order by max desc limit 1").collect()(0).get(0).toString

    // Load only the MySQL rows with a larger id, stamping create_time with the current time
    spark.sql(
      s"""
        |insert into table ods.base_province partition(etl_date = '$nowTime')
        |select
        |id,
        |name,
        |region_id,
        |area_code,
        |iso_code,
        |cast(date_format(current_timestamp(), 'yyyy-MM-dd HH:mm:ss') as timestamp) create_time
        |from baseProvince
        |where
        |id > cast('$max' as int)
        |""".stripMargin)

    // Verify
    spark.sql("select * from ods.base_province").show()
    spark.sql("show partitions ods.base_province").show()
    spark.close()
  }

4. Extract the incremental data of base_region in the shtd_store database into the base_region table in Hive's ods database. Use id in ods.base_region as the incremental field and load only the new rows, keeping field names and types unchanged, while adding a create_time field set to the current time. Add a static partition whose field is of type String and whose value is the day before the practice date, formatted yyyyMMdd. Then run show partitions ods.base_region in the Hive CLI.

  // Imports: same as the user_info job above
  def main(args: Array[String]): Unit = {
    // Run as the root HDFS user
    System.setProperty("HADOOP_USER_NAME", "root")
    System.setProperty("user.name", "root")
    // Create the SparkSession with Hive support
    val spark: SparkSession = SparkSession
      .builder()
      .appName("sparksql")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // MySQL connection properties
    val pro: Properties = new Properties()
    pro.put("user", "root")
    pro.put("password", "p@ssw0rd")
    // MySQL JDBC URL
    val my = "jdbc:mysql://master:3306/shtd_store"
    // Yesterday's date in yyyyMMdd format
    val nowTime: String = LocalDate.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMdd"))

    // If ods.base_region was created without create_time, add the column once:
    //spark.sql("alter table ods.base_region add columns(create_time timestamp)")

    // Register the MySQL table as a temporary view
    spark.read.jdbc(my, "base_region", pro).createOrReplaceTempView("baseRegion")

//    // Full extraction
//    spark.sql(
//      s"""insert overwrite table ods.base_region
//        |partition(etl_date = '$nowTime')
//        |select
//        |id,
//        |region_name,
//        |cast(date_format(current_timestamp(), 'yyyy-MM-dd HH:mm:ss') as timestamp) create_time
//        |from baseRegion
//        |""".stripMargin)

    // Incremental extraction
    // High-water mark: the largest id already in ods (id is stored as a string,
    // so cast it to int before sorting to avoid lexicographic order)
    val max = spark.sql("select id from ods.base_region order by cast(id as int) desc limit 1").collect()(0).get(0).toString

    // Load only the MySQL rows with a larger id, stamping create_time with the current time
    spark.sql(
      s"""
        |insert into table ods.base_region partition(etl_date = '$nowTime')
        |select
        |id,
        |region_name,
        |cast(date_format(current_timestamp(), 'yyyy-MM-dd HH:mm:ss') as timestamp) create_time
        |from baseRegion
        |where
        |cast(id as int) > cast('$max' as int)
        |""".stripMargin)

    // Verify
    spark.sql("select * from ods.base_region").show()
    spark.sql("show partitions ods.base_region").show()
    spark.close()
  }

5. Extract the incremental data of order_info in the shtd_store database into the order_info table in Hive's ods database. Use operate_time or create_time in ods.order_info as the incremental field (for each MySQL row take the larger of the two timestamps and compare it against the larger of the same two fields in ods), and load only the new rows, keeping field names and types unchanged. Add a static partition whose field is of type String and whose value is the day before the practice date, formatted yyyyMMdd. Then run show partitions ods.order_info in the Hive CLI.

  // Imports: same as the user_info job above
  def main(args: Array[String]): Unit = {
    // Run as the root HDFS user
    System.setProperty("HADOOP_USER_NAME", "root")
    System.setProperty("user.name", "root")
    // Create the SparkSession with Hive support
    val spark: SparkSession = SparkSession
      .builder()
      .appName("sparksql")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // MySQL connection properties
    val pro: Properties = new Properties()
    pro.put("user", "root")
    pro.put("password", "p@ssw0rd")
    // MySQL JDBC URL
    val my = "jdbc:mysql://master:3306/shtd_store"
    // Yesterday's date in yyyyMMdd format
    val nowTime: String = LocalDate.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMdd"))

    // Register the MySQL table as a temporary view
    spark.read.jdbc(my, "order_info", pro).createOrReplaceTempView("orderInfo")

//    // Full extraction
//    spark.sql(
//      s"""insert overwrite table ods.order_info
//        |partition(etl_date = '$nowTime')
//        |select *
//        |from orderInfo
//        |""".stripMargin)

    // Incremental extraction
    // High-water mark: the larger of operate_time and create_time already in ods
    // (greatest() in Spark SQL skips null values)
    val max = spark.sql("select greatest(operate_time, create_time) as max from ods.order_info order by max desc limit 1").collect()(0).get(0).toString

    // Load only the MySQL rows newer than the high-water mark
    spark.sql(
      s"""
         |insert into table ods.order_info partition(etl_date = '$nowTime')
         |select *
         |from orderInfo
         |where
         |operate_time > cast('$max' as timestamp) or
         |create_time > cast('$max' as timestamp)
         |""".stripMargin)

    // Verify
    spark.sql("select * from ods.order_info").show()
    spark.sql("show partitions ods.order_info").show()
    spark.close()
  }

6. Extract the incremental data of order_detail in the shtd_store database into the order_detail table in Hive's ods database. Use create_time in ods.order_detail as the incremental field and load only the new rows, keeping field names and types unchanged. Add a static partition whose field is of type String and whose value is the day before the practice date, formatted yyyyMMdd. Then run show partitions ods.order_detail in the Hive CLI.

  // Imports: same as the user_info job above
  def main(args: Array[String]): Unit = {
    // Run as the root HDFS user
    System.setProperty("HADOOP_USER_NAME", "root")
    System.setProperty("user.name", "root")
    // Create the SparkSession with Hive support
    val spark: SparkSession = SparkSession
      .builder()
      .appName("sparksql")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // MySQL connection properties
    val pro: Properties = new Properties()
    pro.put("user", "root")
    pro.put("password", "p@ssw0rd")
    // MySQL JDBC URL
    val my = "jdbc:mysql://master:3306/shtd_store"
    // Yesterday's date in yyyyMMdd format
    val nowTime: String = LocalDate.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMdd"))

    // Register the MySQL table as a temporary view
    spark.read.jdbc(my, "order_detail", pro).createOrReplaceTempView("orderDetail")

//    // Full extraction
//    spark.sql(
//      s"""insert overwrite table ods.order_detail
//        |partition(etl_date = '$nowTime')
//        |select *
//        |from orderDetail
//        |""".stripMargin)

    // Incremental extraction
    // High-water mark: the latest create_time already in ods
    val max = spark.sql("select create_time as max from ods.order_detail order by max desc limit 1").collect()(0).get(0).toString

    // Load only the MySQL rows newer than the high-water mark
    spark.sql(
      s"""
         |insert into table ods.order_detail partition(etl_date = '$nowTime')
         |select *
         |from orderDetail
         |where
         |create_time > cast('$max' as timestamp)
         |""".stripMargin)

    // Verify
    spark.sql("select * from ods.order_detail").show()
    spark.sql("show partitions ods.order_detail").show()
    spark.close()
  }
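
The four timestamp-driven jobs (user_info, sku_info, order_info, order_detail) share the same shape: register a JDBC view, read the high-water mark from ods, then insert the newer rows into yesterday's partition. A reusable helper could look like the sketch below; extractIncremental and its parameters are hypothetical names, not part of the original code, and it assumes the ods table already holds at least one row (run a full extraction first).

import java.util.Properties
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import org.apache.spark.sql.SparkSession

object IncrementalExtract {
  // table: name of both the MySQL source and the ods target table
  // incrCols: the incremental timestamp column(s), e.g. Seq("create_time", "operate_time")
  def extractIncremental(spark: SparkSession, url: String, pro: Properties,
                         table: String, incrCols: Seq[String]): Unit = {
    val etlDate = LocalDate.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMdd"))
    spark.read.jdbc(url, table, pro).createOrReplaceTempView("src")
    // High-water mark: the largest incremental value already in ods
    // (Spark's greatest() needs at least two arguments, hence the branch)
    val expr = if (incrCols.size == 1) incrCols.head
               else s"greatest(${incrCols.mkString(", ")})"
    val max = spark.sql(s"select max($expr) from ods.$table").first().get(0).toString
    // Load rows where any incremental column is newer than the mark
    val where = incrCols.map(c => s"$c > cast('$max' as timestamp)").mkString(" or ")
    spark.sql(
      s"""insert into table ods.$table partition(etl_date = '$etlDate')
         |select * from src where $where
         |""".stripMargin)
  }
}

// Example calls:
//   extractIncremental(spark, my, pro, "user_info", Seq("create_time", "operate_time"))
//   extractIncremental(spark, my, pro, "order_detail", Seq("create_time"))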
