1.查询所有商品(sku_info表)截至到2021年10月01号的最新商品价格(需要结合价格修改表进行分析)
需求结果
2.所用的表和数据
--商品价格变更明细表
CREATE TABLE sku_price_modify_detail
(
`sku_id` string comment '商品id',
`new_price` decimal(16, 2) comment '更改后的价格',
`change_date` string comment '变动日期'
) COMMENT '商品价格变更明细表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
--数据装载
insert overwrite table sku_price_modify_detail
values ('1', 1900, '2021-09-25'),
('1', 2000, '2021-09-26'),
('2', 80, '2021-09-29'),
('2', 10, '2021-09-30'),
('3', 4999, '2021-09-25'),
('3', 5000, '2021-09-26'),
('4', 5600, '2021-09-26'),
('4', 6000, '2021-09-27'),
('5', 490, '2021-09-27'),
('5', 500, '2021-09-28'),
('6', 1988, '2021-09-30'),
('6', 2000, '2021-10-01'),
('7', 88, '2021-09-28'),
('7', 100, '2021-09-29'),
('8', 800, '2021-09-28'),
('8', 600, '2021-09-29'),
('9', 1100, '2021-09-27'),
('9', 1000, '2021-09-28'),
('10', 90, '2021-10-01'),
('10', 100, '2021-10-02'),
('11', 66, '2021-10-01'),
('11', 50, '2021-10-02'),
('12', 35, '2021-09-28'),
('12', 20, '2021-09-29');
--商品信息表
CREATE TABLE sku_info
(
`sku_id` string COMMENT '商品id',
`name` string COMMENT '商品名称',
`category_id` string COMMENT '所属分类id',
`from_date` string COMMENT '上架日期',
`price` double COMMENT '商品单价'
) COMMENT '商品属性表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
--数据装载
insert overwrite table sku_info
values ('1', 'xiaomi 10', '1', '2020-01-01', 2000),
('2', '手机壳', '1', '2020-02-01', 10),
('3', 'apple 12', '1', '2020-03-01', 5000),
('4', 'xiaomi 13', '1', '2020-04-01', 6000),
('5', '破壁机', '2', '2020-01-01', 500),
('6', '洗碗机', '2', '2020-02-01', 2000),
('7', '热水壶', '2', '2020-03-01', 100),
('8', '微波炉', '2', '2020-04-01', 600),
('9', '自行车', '3', '2020-01-01', 1000),
('10', '帐篷', '3', '2020-02-01', 100),
('11', '烧烤架', '3', '2020-02-01', 50),
('12', '遮阳伞', '3', '2020-03-01', 20);
3.答案
思路:考察分组topN以及连接查询
3.1从价格变动表中筛选出小于2021-10-01之前的变动,并使用rank()开窗,以sku_id进行分区,以改变时间降序进行排序的到最新一次的更改价格记录
select sku_id, change_date, rank() over (partition by sku_id order by change_date desc) rk
from sku_price_modify_detail
where change_date <= '2021-10-01';
运行结果
3.2上述结果筛选出rk = 1的记录
select sku_id, new_price
from (select sku_id, change_date, new_price, rank() over (partition by sku_id order by change_date desc) rk
from sku_price_modify_detail
where change_date <= '2021-10-01') t1
where rk = 1;
运行结果
3.3sku_info与上述结果进行左连接,匹配上的使用new_price,匹配不上的使用price
select t2.sku_id, cast(nvl(new_price, price) as decimal(16,2)) price
from sku_info t2
left join (select sku_id, new_price
from (select sku_id,
change_date,
new_price,
rank() over (partition by sku_id order by change_date desc) rk
from sku_price_modify_detail
where change_date <= '2021-10-01') t1
where rk = 1) t3 on t2.sku_id = t3.sku_id;
这里如果你的spark job执行报错多半是关闭运算符时出错,改一下配置就行
#这里如果你spark job执行失败,客户端提示一下
--Spark job failed during runtime. Please check stacktrace for the root cause.
--具体报错
java.lang.IllegalStateException: Hit error while closing operators - failing tree: org.apache.hadoop.hive.ql.metadata.HiveException: org.apache.hadoop.hive.ql.metadata.HiveException: Error evaluating _col0
at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.close(SparkMapRecordHandler.java:197)
at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunctionResultList.closeRecordProcessor(HiveMapFunctionResultList.java:58)
at org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:96)
at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:45)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at org.apache.spark.rdd.AsyncRDDActions.$anonfun$foreachAsync$2(AsyncRDDActions.scala:127)
at org.apache.spark.rdd.AsyncRDDActions.$anonfun$foreachAsync$2$adapted(AsyncRDDActions.scala:127)
at org.apache.spark.SparkContext.$anonfun$submitJob$1(SparkContext.scala:2377)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:136)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: org.apache.hadoop.hive.ql.metadata.HiveException: Error evaluating _col0
at org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterStringOperator.process(VectorMapJoinOuterStringOperator.java:447)
at org.apache.hadoop.hive.ql.exec.Operator.vectorForward(Operator.java:966)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:939)
at org.apache.hadoop.hive.ql.exec.vector.VectorSelectOperator.process(VectorSelectOperator.java:158)
at org.apache.hadoop.hive.ql.exec.Operator.vectorForward(Operator.java:966)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:939)
at org.apache.hadoop.hive.ql.exec.TableScanOperator.process(TableScanOperator.java:125)
at org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator.closeOp(VectorMapOperator.java:990)
at org.apache.hadoop.hive.ql.exec.Operator.close(Operator.java:733)
at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.close(SparkMapRecordHandler.java:174)
... 17 more
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Error evaluating _col0
at org.apache.hadoop.hive.ql.exec.vector.VectorSelectOperator.process(VectorSelectOperator.java:149)
at org.apache.hadoop.hive.ql.exec.Operator.vectorForward(Operator.java:966)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:939)
at org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinGenerateResultOperator.forwardBigTableBatch(VectorMapJoinGenerateResultOperator.java:640)
at org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterStringOperator.process(VectorMapJoinOuterStringOperator.java:441)
... 26 more
Caused by: java.lang.ClassCastException: org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector cannot be cast to org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector
at org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector.setElement(DoubleColumnVector.java:213)
at org.apache.hadoop.hive.ql.exec.vector.expressions.VectorCoalesce.evaluate(VectorCoalesce.java:150)
at org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression.evaluateChildren(VectorExpression.java:271)
at org.apache.hadoop.hive.ql.exec.vector.expressions.FuncDoubleToDecimal.evaluate(FuncDoubleToDecimal.java:55)
at org.apache.hadoop.hive.ql.exec.vector.VectorSelectOperator.process(VectorSelectOperator.java:146)
... 30 more
--解决办法,关闭矢量化计算,官网写的默认关闭,不知道这里啥时候开启了,关了就行
set hive.vectorized.execution.enabled = false;
--具体官网参考
https://cwiki.apache.org/confluence/display/Hive/Vectorized+Query+Execution
运行结果