Data Preparation
Preparing the data files
Download the data: https://download.csdn.net/download/dafsq/87635868?spm=1001.2014.3001.5501
Create the tables in MySQL and import the data.
Once the import finishes, inspect the tables in MySQL to confirm the data loaded correctly.
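If you prefer to verify the import from code rather than the mysql client, the following minimal sketch reads every table back over JDBC and prints its row count. It assumes the same connection details (master:3306, database shtd_store, user root) used in the task code further below; adjust them to your environment.

import java.util.Properties
import org.apache.spark.sql.SparkSession

object CheckMysqlImport {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("check-mysql-import").master("local[*]").getOrCreate()
    val pro = new Properties()
    pro.put("user", "root")
    pro.put("password", "p@ssw0rd")
    val my = "jdbc:mysql://master:3306/shtd_store"
    // print a row count for every imported table
    Seq("user_info", "sku_info", "base_province", "base_region", "order_info", "order_detail")
      .foreach(t => println(s"$t: ${spark.read.jdbc(my, t, pro).count()} rows"))
    spark.close()
  }
}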
Creating the Hive database and tables
1. Create the ods database
create database ods;
2. Create the user_info table, partitioned by etl_date
CREATE TABLE `user_info` (
`id` bigint,
`login_name` string,
`nick_name` string,
`passwd` string,
`name` string,
`phone_num` string,
`email` string,
`head_img` string,
`user_level` string,
`birthday` timestamp,
`gender` string,
`create_time` timestamp,
`operate_time` timestamp
) PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;
3. Create the sku_info table, also partitioned by etl_date
CREATE TABLE `sku_info` (
`id` bigint,
`spu_id` bigint,
`price` decimal(10, 0),
`sku_name` string,
`sku_desc` string,
`weight` decimal(10, 2),
`tm_id` bigint,
`category3_id` bigint,
`sku_default_img` string,
`create_time` timestamp
) PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;
4. Create the base_province table, partitioned by etl_date (the create_time column required by task 3 is added later with ALTER TABLE in the task code)
CREATE TABLE `base_province` (
`id` bigint,
`name` string,
`region_id` string,
`area_code` string,
`iso_code` string
) PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;
5. Create the base_region table, partitioned by etl_date (the create_time column required by task 4 is added later with ALTER TABLE in the task code)
CREATE TABLE `base_region` (
`id` string,
`region_name` string
) PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;
6. Create the order_info table, partitioned by etl_date
CREATE TABLE `order_info` (
`id` bigint,
`consignee` string,
`consignee_tel` string,
`final_total_amount` decimal(16, 2),
`order_status` string,
`user_id` bigint,
`delivery_address` string,
`order_comment` string,
`out_trade_no` string,
`trade_body` string,
`create_time` timestamp,
`operate_time` timestamp,
`expire_time` timestamp,
`tracking_no` string,
`parent_order_id` bigint,
`img_url` string,
`province_id` int,
`benefit_reduce_amount` decimal(16, 2),
`original_total_amount` decimal(16, 2),
`feight_fee` decimal(16, 2)
) PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;
7. Create the order_detail table, partitioned by etl_date
CREATE TABLE `order_detail` (
`id` bigint,
`order_id` bigint,
`sku_id` bigint,
`sku_name` string,
`img_url` string,
`order_price` decimal(10, 2),
`sku_num` string,
`create_time` timestamp,
`source_type` string,
`source_id` bigint
) PARTITIONED BY ( `etl_date` string)
row format delimited
fields terminated by "\001"
stored as textfile;
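Before moving on to the tasks, it is worth confirming that Spark can see the ods tables through the Hive metastore. A minimal sketch, assuming enableHiveSupport() picks up your hive-site.xml exactly as in the task code below:

import org.apache.spark.sql.SparkSession

object CheckOdsTables {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "root")
    val spark = SparkSession.builder()
      .appName("check-ods")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    // should list the six tables created above
    spark.sql("show tables in ods").show(false)
    spark.close()
  }
}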
Tasks
Testing tip: run a full extraction of the MySQL data once first, then insert a few rows with the newest timestamps into MySQL and extract again with the incremental logic. (Each incremental job reads its watermark from the ods table, so it needs at least one prior load to work against.) It is also worth modifying the SQL yourself and re-running it to deepen your understanding.
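One way to generate such test rows without hand-writing an INSERT for every column is to copy the newest row back into MySQL through Spark's JDBC writer, shifting its id and stamping it with the current time. This is only a sketch for user_info, assuming the connection details used in the task code below and that the shifted id does not collide with an existing primary key or unique constraint; the same pattern works for the other tables.

import java.util.Properties
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.functions._

object InsertTestRow {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("insert-test-row").master("local[*]").getOrCreate()
    val pro = new Properties()
    pro.put("user", "root")
    pro.put("password", "p@ssw0rd")
    val my = "jdbc:mysql://master:3306/shtd_store"
    // copy the row with the highest id, shift the id to avoid a key collision,
    // and stamp it with the current time so it is newer than anything already in ods
    val testRow = spark.read.jdbc(my, "user_info", pro)
      .orderBy(col("id").desc)
      .limit(1)
      .withColumn("id", col("id") + 100000)
      .withColumn("create_time", current_timestamp())
      .withColumn("operate_time", current_timestamp())
    testRow.write.mode(SaveMode.Append).jdbc(my, "user_info", pro)
    spark.close()
  }
}

After running this once, re-running the incremental job for user_info should pick up exactly the new row.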
1. Extract the incremental data of user_info in the shtd_store database into the table user_info in Hive's ods database. Use operate_time or create_time in ods.user_info as the incremental field (that is, for each MySQL row take the greater of the two times and compare it with the greater of the same two fields in ods); load only the newly added rows, keeping field names and types unchanged, and add a static partition whose field is of type String and whose value is the day before the current practice date (partition field format yyyyMMdd). Run show partitions ods.user_info in the Hive CLI to verify.
import java.util.Properties
import java.time.LocalDate
import org.apache.spark.sql.SparkSession

def main(args: Array[String]): Unit = {
//set the HDFS user
System.setProperty("HADOOP_USER_NAME","root")
System.setProperty("user.name","root")
//create the SparkSession
val spark: SparkSession = SparkSession
.builder()
.appName("sparksql")
.master("local[*]")
.enableHiveSupport()
.getOrCreate()
//MySQL connection properties
val pro: Properties = new Properties()
pro.put("user","root")
pro.put("password","p@ssw0rd")
//MySQL JDBC URL
val my = "jdbc:mysql://master:3306/shtd_store"
//yesterday's date in yyyyMMdd format, used as the partition value
val nowTime: String = LocalDate.now().plusDays(-1).toString().replace("-","")
//read the MySQL table and register it as a temp view
spark.read.jdbc(my,"user_info",pro).createOrReplaceTempView("userInfo")
//full extraction (run this once before the first incremental run)
// spark.sql(
//   s"""insert overwrite table ods.user_info
//      |partition(etl_date = $nowTime)
//      |select * from userInfo
//      |""".stripMargin)
//incremental extraction
//watermark: the greater of create_time and operate_time already present in ods
val max = spark.sql("select if(operate_time is null or create_time > operate_time, create_time, operate_time) as max1 from ods.user_info order by max1 desc limit 1").collect()(0).get(0).toString
//pull only the user_info rows newer than the watermark
spark.sql(
s"""
|insert into table ods.user_info
|partition(etl_date = $nowTime)
|select * from userInfo
|where
|create_time > cast('$max' as timestamp) or
|operate_time > cast('$max' as timestamp)
|""".stripMargin)
//check the result and the partitions
spark.sql("select * from ods.user_info").show()
spark.sql("show partitions ods.user_info").show()
spark.close()
}
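Since tasks 1 to 6 differ only in the table name and the watermark expression, the watermark lookup can be factored into a small helper. The sketch below is one possible refactor and is not part of the original code; the function name and signature are illustrative.

import org.apache.spark.sql.SparkSession

// returns the largest value of `expr` already loaded into ods.`table`, e.g.
// watermark(spark, "sku_info", "create_time")
// watermark(spark, "user_info", "if(operate_time is null or create_time > operate_time, create_time, operate_time)")
def watermark(spark: SparkSession, table: String, expr: String): String =
  spark.sql(s"select max($expr) as w from ods.$table").collect()(0).get(0).toString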
2. Extract the incremental data of sku_info in the shtd_store database into the table sku_info in Hive's ods database. Use create_time in ods.sku_info as the incremental field; load only the newly added rows, keeping field names and types unchanged, and add a static partition whose field is of type String and whose value is the day before the current practice date (partition field format yyyyMMdd). Run show partitions ods.sku_info in the Hive CLI to verify.
def main(args: Array[String]): Unit = {
//set the HDFS user
System.setProperty("HADOOP_USER_NAME","root")
System.setProperty("user.name","root")
//create the SparkSession
val spark: SparkSession = SparkSession
.builder()
.appName("sparksql")
.master("local[*]")
.enableHiveSupport()
.getOrCreate()
//MySQL connection properties
val pro: Properties = new Properties()
pro.put("user","root")
pro.put("password","p@ssw0rd")
//MySQL JDBC URL
val my = "jdbc:mysql://master:3306/shtd_store"
//yesterday's date in yyyyMMdd format, used as the partition value
val nowTime: String = LocalDate.now().plusDays(-1).toString().replace("-","")
//read the MySQL table and register it as a temp view
spark.read.jdbc(my,"sku_info",pro).createOrReplaceTempView("skuInfo")
//full extraction (run this once before the first incremental run)
// spark.sql(
//   s"""insert overwrite table ods.sku_info
//      |partition(etl_date = $nowTime)
//      |select * from skuInfo
//      |""".stripMargin)
//incremental extraction
//watermark: the latest create_time already present in ods
val max = spark.sql("select create_time as max from ods.sku_info order by max desc limit 1").collect()(0).get(0).toString
//pull only the sku_info rows newer than the watermark
spark.sql(
s"""
|insert into table ods.sku_info
|partition(etl_date = $nowTime)
|select * from skuInfo
|where
|create_time > cast('$max' as timestamp)
|""".stripMargin)
//check the result and the partitions
spark.sql("select * from ods.sku_info").show()
spark.sql("show partitions ods.sku_info").show()
spark.close()
}
3. Extract the incremental data of base_province in the shtd_store database into the table base_province in Hive's ods database. Use id in ods.base_province as the incremental field; load only the newly added rows, keeping field names and types unchanged, add a create_time field set to the current time, and add a static partition whose field is of type String and whose value is the day before the current practice date (partition field format yyyyMMdd). Run show partitions ods.base_province in the Hive CLI to verify.
def main(args: Array[String]): Unit = {
//set the HDFS user
System.setProperty("HADOOP_USER_NAME","root")
System.setProperty("user.name","root")
//create the SparkSession
val spark: SparkSession = SparkSession
.builder()
.appName("sparksql")
.master("local[*]")
.enableHiveSupport()
.getOrCreate()
//MySQL connection properties
val pro: Properties = new Properties()
pro.put("user","root")
pro.put("password","p@ssw0rd")
//MySQL JDBC URL
val my = "jdbc:mysql://master:3306/shtd_store"
//yesterday's date in yyyyMMdd format, used as the partition value
val nowTime: String = LocalDate.now().plusDays(-1).toString().replace("-","")
//add the create_time column to the ods table (run once; the DDL above does not define it)
//spark.sql("alter table ods.base_province add columns(create_time timestamp)")
//read the MySQL table and register it as a temp view
spark.read.jdbc(my,"base_province",pro).createOrReplaceTempView("baseProvince")
//full extraction (run this once before the first incremental run)
// spark.sql(
//   s"""insert overwrite table ods.base_province
//      |partition(etl_date = $nowTime)
//      |select
//      |id,
//      |name,
//      |region_id,
//      |area_code,
//      |iso_code,
//      |cast(date_format(current_timestamp(), 'yyyy-MM-dd HH:mm:ss') as timestamp) create_time
//      |from baseProvince
//      |""".stripMargin)
//incremental extraction
//watermark: the largest id already present in ods
val max = spark.sql("select id as max from ods.base_province order by max desc limit 1").collect()(0).get(0).toString
//pull only the base_province rows with a larger id, stamping them with the current time
spark.sql(
s"""
|insert into table ods.base_province partition(etl_date = $nowTime)
|select
|id,
|name,
|region_id,
|area_code,
|iso_code,
|cast(date_format(current_timestamp(), 'yyyy-MM-dd HH:mm:ss') as timestamp) create_time
|from baseProvince
|where
|id > cast('$max' as int)
|""".stripMargin)
//check the result and the partitions
spark.sql("select * from ods.base_province").show()
spark.sql("show partitions ods.base_province").show()
spark.close()
}
4. Extract the incremental data of base_region in the shtd_store database into the table base_region in Hive's ods database. Use id in ods.base_region as the incremental field; load only the newly added rows, keeping field names and types unchanged, add a create_time field set to the current time, and add a static partition whose field is of type String and whose value is the day before the current practice date (partition field format yyyyMMdd). Run show partitions ods.base_region in the Hive CLI to verify.
def main(args: Array[String]): Unit = {
//set the HDFS user
System.setProperty("HADOOP_USER_NAME","root")
System.setProperty("user.name","root")
//create the SparkSession
val spark: SparkSession = SparkSession
.builder()
.appName("sparksql")
.master("local[*]")
.enableHiveSupport()
.getOrCreate()
//MySQL connection properties
val pro: Properties = new Properties()
pro.put("user","root")
pro.put("password","p@ssw0rd")
//MySQL JDBC URL
val my = "jdbc:mysql://master:3306/shtd_store"
//yesterday's date in yyyyMMdd format, used as the partition value
val nowTime: String = LocalDate.now().plusDays(-1).toString().replace("-","")
//add the create_time column to the ods table (run once; the DDL above does not define it)
//spark.sql("alter table ods.base_region add columns(create_time timestamp)")
//read the MySQL table and register it as a temp view
spark.read.jdbc(my,"base_region",pro).createOrReplaceTempView("baseRegion")
//full extraction (run this once before the first incremental run)
// spark.sql(
//   s"""insert overwrite table ods.base_region
//      |partition(etl_date = $nowTime)
//      |select
//      |id,
//      |region_name,
//      |cast(date_format(current_timestamp(), 'yyyy-MM-dd HH:mm:ss') as timestamp) create_time
//      |from baseRegion
//      |""".stripMargin)
//incremental extraction
//watermark: the largest id already present in ods (id is a string column, so cast before ordering)
val max = spark.sql("select cast(id as int) as max from ods.base_region order by max desc limit 1").collect()(0).get(0).toString
//pull only the base_region rows with a larger id, stamping them with the current time
spark.sql(
s"""
|insert into table ods.base_region partition(etl_date = $nowTime)
|select
|id,
|region_name,
|cast(date_format(current_timestamp(), 'yyyy-MM-dd HH:mm:ss') as timestamp) create_time
|from baseRegion
|where
|cast(id as int) > cast('$max' as int)
|""".stripMargin)
//check the result and the partitions
spark.sql("select * from ods.base_region").show()
spark.sql("show partitions ods.base_region").show()
spark.close()
}
5. Extract the incremental data of order_info in the shtd_store database into the table order_info in Hive's ods database. Use operate_time or create_time in ods.order_info as the incremental field (that is, for each MySQL row take the greater of the two times and compare it with the greater of the same two fields in ods); load only the newly added rows, keeping field names and types unchanged, and add a static partition whose field is of type String and whose value is the day before the current practice date (partition field format yyyyMMdd). Run show partitions ods.order_info in the Hive CLI to verify.
def main(args: Array[String]): Unit = {
//set the HDFS user
System.setProperty("HADOOP_USER_NAME","root")
System.setProperty("user.name","root")
//create the SparkSession
val spark: SparkSession = SparkSession
.builder()
.appName("sparksql")
.master("local[*]")
.enableHiveSupport()
.getOrCreate()
//MySQL connection properties
val pro: Properties = new Properties()
pro.put("user","root")
pro.put("password","p@ssw0rd")
//MySQL JDBC URL
val my = "jdbc:mysql://master:3306/shtd_store"
//yesterday's date in yyyyMMdd format, used as the partition value
val nowTime: String = LocalDate.now().plusDays(-1).toString().replace("-","")
//read the MySQL table and register it as a temp view
spark.read.jdbc(my,"order_info",pro).createOrReplaceTempView("orderInfo")
//full extraction (run this once before the first incremental run)
// spark.sql(
//   s"""insert overwrite table ods.order_info
//      |partition(etl_date = $nowTime)
//      |select *
//      |from orderInfo
//      |""".stripMargin)
//incremental extraction
//watermark: the greater of operate_time and create_time already present in ods
val max = spark.sql("select if(operate_time > create_time, operate_time, create_time) as max from ods.order_info order by max desc limit 1").collect()(0).get(0).toString
//pull only the order_info rows newer than the watermark
spark.sql(
s"""
|insert into table ods.order_info partition(etl_date = $nowTime)
|select *
|from orderInfo
|where
|operate_time > cast('$max' as timestamp) or
|create_time > cast('$max' as timestamp)
|""".stripMargin)
//check the result and the partitions
spark.sql("select * from ods.order_info").show()
spark.sql("show partitions ods.order_info").show()
spark.close()
}
6. Extract the incremental data of order_detail in the shtd_store database into the table order_detail in Hive's ods database. Use create_time in ods.order_detail as the incremental field; load only the newly added rows, keeping field names and types unchanged, and add a static partition whose field is of type String and whose value is the day before the current practice date (partition field format yyyyMMdd). Run show partitions ods.order_detail in the Hive CLI to verify.
def main(args: Array[String]): Unit = {
//set the HDFS user
System.setProperty("HADOOP_USER_NAME","root")
System.setProperty("user.name","root")
//create the SparkSession
val spark: SparkSession = SparkSession
.builder()
.appName("sparksql")
.master("local[*]")
.enableHiveSupport()
.getOrCreate()
//MySQL connection properties
val pro: Properties = new Properties()
pro.put("user","root")
pro.put("password","p@ssw0rd")
//MySQL JDBC URL
val my = "jdbc:mysql://master:3306/shtd_store"
//yesterday's date in yyyyMMdd format, used as the partition value
val nowTime: String = LocalDate.now().plusDays(-1).toString().replace("-","")
//read the MySQL table and register it as a temp view
spark.read.jdbc(my,"order_detail",pro).createOrReplaceTempView("orderDetail")
//full extraction (run this once before the first incremental run)
// spark.sql(
//   s"""insert overwrite table ods.order_detail
//      |partition(etl_date = $nowTime)
//      |select *
//      |from orderDetail
//      |""".stripMargin)
//incremental extraction
//watermark: the latest create_time already present in ods
val max = spark.sql("select create_time as max from ods.order_detail order by max desc limit 1").collect()(0).get(0).toString
//pull only the order_detail rows newer than the watermark
spark.sql(
s"""
|insert into table ods.order_detail partition(etl_date = $nowTime)
|select *
|from orderDetail
|where
|create_time > cast('$max' as timestamp)
|""".stripMargin)
//check the result and the partitions
spark.sql("select * from ods.order_detail").show()
spark.sql("show partitions ods.order_detail").show()
spark.close()
}