SparkSQL综合案例练习2
数据集
- InvoiceNo 订单编号
- StockCode 产品代码
- Description 产品描述
- Quantity 数量
- InvoiceDate 日期
- UnitPrice 单价
- CustomerID 客户编号
- Country 国家名称
要求:
需求一: 统计各个国家有多少的客户量
需求二: 统计销量最高的10个国家
需求三: 各个国家的总销售额分布情况
需求四: 销售最高的10个商品
需求五: 商品描述的热门关键词TOP300
需求六: 统计退货订单数最多的10个国家
需求七: 商品的平均单价与销售的关系
需求八: 月销售额随时间的变化的趋势
需求九: 日销售随时间的变化趋势
需求十: 各个国家的购买订单量和退货订单量关系
清洗操作
过滤掉客户ID为空或为0的数据,以及商品描述为空的数据;并统一时间格式,最后将清洗结果存入HDFS
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

# Point Spark at the local install and force driver/executors to use the
# same Anaconda interpreter.
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    print("新零售案例实现: 清洗需求")
    # 1. Create the SparkSession object (local mode, 4 shuffle partitions)
    spark = SparkSession\
        .builder\
        .master('local[*]')\
        .appName('demo')\
        .config('spark.sql.shuffle.partitions',4)\
        .getOrCreate()
    # 2. Read the external CSV data set (schema inferred from the data)
    df = spark.read.csv(
        path='file:///export/data/Data.csv',
        sep=',',
        header=True,
        inferSchema=True
    )
    df.printSchema()
    df.show()
    # Cleaning rule 1: drop rows whose CustomerID is null/0 or whose
    # Description is null/empty.
    df = df.where('CustomerID is not null and CustomerID != 0 and Description is not null and Description != ""')
    # Cleaning rule 2: reformat InvoiceDate from 'M/d/yyyy H:mm'
    # to 'yyyy-MM-dd HH:mm' (string column, via unix timestamp round-trip).
    df = df.withColumn('InvoiceDate',F.from_unixtime(F.unix_timestamp('InvoiceDate','M/d/yyyy H:mm'),'yyyy-MM-dd HH:mm'))
    # Persist the cleaned result to HDFS ('|' separated, GBK encoded).
    # NOTE(review): no mode is set, so this fails if the path exists — confirm intended.
    df.write.csv(
        path='hdfs://node1:8020/demo/output',
        header=True,
        sep='|',
        encoding='GBK'
    )
    spark.stop()
指标统计
准备工作:将数据读取进入DataFrame
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

# Point Spark at the local install and force driver/executors to use the
# same Anaconda interpreter.
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    # 1. Create the SparkSession object (local mode, 4 shuffle partitions)
    spark = SparkSession\
        .builder\
        .master('local[*]')\
        .appName('xls')\
        .config('spark.sql.shuffle.partitions',4)\
        .getOrCreate()
    # 2. Read the cleaned data back from HDFS with the same sep/encoding
    #    the cleaning job used when writing.
    #    FIX: the cleaning job writes to /demo/output, so read that path
    #    (original read /xls/output, which the cleaning step never writes).
    df = spark.read.csv(
        path='hdfs://node1:8020/demo/output',
        sep='|',
        encoding='GBK',
        header=True,
        inferSchema=True
    )
    # Register a temp view so the requirements below can run Spark SQL
    df.createTempView('tab')
需求1:统计各个国家有多少的客户量
- SQL
# Requirement 1 (SQL): distinct customer count per country, descending.
spark.sql("""
select Country, count(distinct CustomerID) as cnt from tab group by Country order by cnt desc
""").show()
- DSL
# Requirement 1 (DSL): distinct customer count per country, descending.
customer_counts = df.groupBy('Country').agg(F.countDistinct('CustomerID').alias('c_cnt'))
customer_counts.orderBy(F.col('c_cnt').desc()).show()
需求2:统计销量最高的10个国家
- SQL
# Requirement 2 (SQL): top 10 countries by distinct order (invoice) count.
# NOTE(review): return invoices ('C%' per requirement 6) are not excluded
# here — confirm whether "sales volume" should include them.
spark.sql("""
select Country, count(distinct InvoiceNo) as cnt from tab group by Country order by cnt desc limit 10
""").show()
- DSL
# Requirement 2 (DSL): top 10 countries by distinct order (invoice) count.
order_counts = df.groupBy('Country').agg(F.countDistinct('InvoiceNo').alias('c_cnt'))
order_counts.orderBy(F.col('c_cnt').desc()).limit(10).show()
需求3:各个国家的总销售额分布情况
- SQL
# Requirement 3 (SQL): total sales amount per country, descending.
# Each line amount (Quantity * UnitPrice) is rounded to 2 decimals
# before summing.
spark.sql("""
with total_tab as (
select
Country,
round(Quantity * UnitPrice,2) as total_price
from tab
)
select
Country, sum(total_price) as s_price from total_tab group by Country order by s_price desc
""").show()
- DSL
# Requirement 3 (DSL): total sales amount per country, descending.
line_totals = df.selectExpr("Country", "round(Quantity * UnitPrice,2) as total_price")
country_totals = line_totals.groupBy('Country').agg(
    F.round(F.sum('total_price'), 2).alias('s_price')
)
country_totals.orderBy(F.col('s_price').desc()).show()
需求4:销售最高的10个商品
- SQL
# Requirement 4 (SQL): top 10 products by total sales amount.
# FIX: the original ordered by 'max_price', a column that does not exist
# anywhere in this query (the aggregate is aliased 'total_price'), so the
# statement could not resolve. Order by 'total_price' instead.
spark.sql("""
with max_store as (
select
StockCode,
round(Quantity*UnitPrice, 2) as total_price
from tab
)
select
StockCode,
round(sum(total_price),2) as total_price
from max_store group by StockCode order by total_price desc limit 10
""").show()
- DSL
# Requirement 4 (DSL): top 10 products by total sales amount.
line_totals = df.selectExpr("StockCode", "round(Quantity * UnitPrice,2) as total_price")
product_totals = line_totals.groupBy('StockCode').agg(
    F.round(F.sum('total_price'), 2).alias('total_price')
)
product_totals.orderBy(F.col('total_price').desc()).limit(10).show()
需求5:商品描述的热门关键词TOP300
- SQL
# Requirement 5 (SQL): top 300 keywords in product descriptions.
# Splits Description on a single space and counts each token.
# NOTE(review): empty tokens from consecutive spaces are counted too —
# confirm whether they should be filtered.
spark.sql("""
with keywords as (
select explode(split(Description," ")) as keyword from tab
)
select keyword, count(1) as cnt from keywords group by keyword order by cnt desc limit 300
""").show()
- DSL
# Requirement 5 (DSL): top 300 keywords in product descriptions.
keywords = df.select(F.explode(F.split('Description', ' ')).alias('keyword'))
keyword_counts = keywords.groupBy('keyword').agg(F.count('keyword').alias('cnt'))
keyword_counts.orderBy(F.col('cnt').desc()).limit(300).show()
需求6:统计退货订单数最多的10个国家(C开头表示退货)
- SQL
# Requirement 6 (SQL): top 10 countries by return-order count.
# Invoice numbers starting with 'C' mark returns (per the requirement).
# Lowercase 'country' resolves because Spark SQL identifier resolution is
# case-insensitive by default.
spark.sql("""
select
country,
count(1) as cnt
from tab where InvoiceNo like 'C%'
group by country order by cnt desc limit 10
""").show()
- DSL
# Requirement 6 (DSL): top 10 countries by return-order ('C%' invoice) count.
returns = df.where('InvoiceNo like "C%"')
return_counts = returns.groupBy('country').agg(F.count('country').alias('cnt'))
return_counts.orderBy(F.col('cnt').desc()).limit(10).show()
需求7:商品的平均单价与销售的关系
- SQL
# Requirement 7 (SQL): per-product average unit price vs. total quantity
# sold, for examining the price/volume relationship.
spark.sql("""
select
StockCode,
avg(UnitPrice) as avg_price,
sum(Quantity) as sum_quantity
from tab
group by StockCode
""").show()
- DSL
# Requirement 7 (DSL): per-product average unit price vs. total quantity sold.
price_vs_volume = df.groupBy('StockCode').agg(
    F.avg('UnitPrice').alias('avg_price'),
    F.sum('Quantity').alias('sum_quantity')
)
price_vs_volume.show()
需求8:月销售额随时间的变化的趋势
- SQL
# Requirement 8 (SQL): monthly sales amount over time.
# InvoiceDate was normalized to 'yyyy-MM-dd HH:mm' by the cleaning job,
# so the first 7 characters are the 'yyyy-MM' month; returns ('C%') are
# excluded.
spark.sql("""
select
substr(InvoiceDate, 1, 7) as month,
round(sum(Quantity * UnitPrice),2) as total_price
from tab
where InvoiceNo not like 'C%'
group by month order by month
""").show()
- DSL
# 上面的题目可以按照下面的方式进行优化
# Requirement 8 (DSL): monthly sales amount over time, returns excluded.
# substring(InvoiceDate, 1, 7) yields the 'yyyy-MM' month prefix.
sales = df.where("InvoiceNo not like 'C%'")
monthly = sales.groupBy(F.substring('InvoiceDate', 1, 7).alias('month')).agg(
    F.round(F.sum(F.col('Quantity') * F.col('UnitPrice')), 2).alias('total_price')
)
monthly.orderBy('month').show()
需求9:日销售随时间的变化趋势
- SQL
# Requirement 9 (SQL): daily sales amount over time, returns excluded.
# The first 10 characters of the normalized InvoiceDate are 'yyyy-MM-dd'.
# FIX: the original grouped by 'month', which does not exist in this
# query (the alias here is 'day'), so the statement could not resolve.
spark.sql("""
select
substr(InvoiceDate, 1, 10) as day,
round(sum(Quantity * UnitPrice),2) as total_price
from tab
where InvoiceNo not like 'C%'
group by day order by day
""").show()
- DSL
# Requirement 9 (DSL): daily sales amount over time, returns excluded.
# substring(InvoiceDate, 1, 10) yields the 'yyyy-MM-dd' day prefix.
sales = df.where("InvoiceNo not like 'C%'")
daily = sales.groupBy(F.substring('InvoiceDate', 1, 10).alias('day')).agg(
    F.round(F.sum(F.col('Quantity') * F.col('UnitPrice')), 2).alias('total_price')
)
daily.orderBy('day').show()
需求10:各个国家的购买订单量和退货订单量关系
- SQL
# Requirement 10 (SQL): per country, total distinct orders (o_cnt) vs.
# distinct return orders (t_cnt); the conditional if(...) maps non-return
# invoices to NULL, which count(distinct ...) ignores.
# FIX: the original selected from 'xls_tab', but the registered temp view
# is named 'tab' — the query would fail with "table not found".
spark.sql("""
select
Country,
count(distinct InvoiceNo) as o_cnt,
count( DISTINCT
if(InvoiceNo like 'C%',InvoiceNo,NULL)
) as t_cnt
from tab group by Country
""").show()
- DSL
# Requirement 10 (DSL): per country, total distinct orders vs. distinct
# return orders. Non-return invoices map to NULL, which countDistinct skips.
return_invoice = F.expr("if(InvoiceNo like 'C%',InvoiceNo,NULL)")
per_country = df.groupBy('Country').agg(
    F.countDistinct('InvoiceNo').alias('o_cnt'),
    F.countDistinct(return_invoice).alias('t_cnt')
)
per_country.show()