Prologue
A post on a Tieba forum asked for a SQL solution to this puzzle: find the teams that won the championship in consecutive years, and the years of each run.
http://tieba.baidu.com/p/1550921098
Requirement
Write one SQL statement that returns every team with a consecutive championship run, together with the run's start and end years.
Data
team,y
活塞,1990
公牛,1991
公牛,1992
公牛,1993
火箭,1994
火箭,1995
公牛,1996
公牛,1997
公牛,1998
马刺,1999
湖人,2000
湖人,2001
湖人,2002
马刺,2003
活塞,2004
马刺,2005
热火,2006
马刺,2007
凯尔特人,2008
湖人,2009
湖人,2010
Expected result
+----+------+------+
|team|min(y)|max(y)|
+----+------+------+
| 公牛| 1991| 1993|
| 火箭| 1994| 1995|
| 公牛| 1996| 1998|
| 湖人| 2000| 2002|
| 湖人| 2009| 2010|
+----+------+------+
Solution
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val spark = SparkSession
.builder()
.appName("Spark SQL basic example")
.master("local")
.config("spark.some.config.option", "some-value")
.getOrCreate()
def orderByFor() = {
val df = spark.read
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") //缺少这个会提示 错误 java.lang.IllegalArgumentException: Illegal pattern component: XXX
.option("header", true) // 第一行作为 头
.option("delimiter", ",") //分隔符
.option("inferschema", true) //自动类型推断
.csv("d://ord.csv")
df.createOrReplaceTempView("v_data")
/**
* step 1: partition by team and number each team's years in ascending order
*/
spark.sql(
"""
|select team,
| y,
| row_number() over(partition by team order by y) as y_row_number,
| y - row_number() over(partition by team order by y) as next_y
| from v_data
""".stripMargin) //.show()
/**
* step 2: if the year minus its row number (next_y) is the same for two rows, those years are adjacent, i.e. the team won in consecutive years
*/
spark.sql(
"""
|select team, y, next_y
| from (select team,
| y,
| row_number() over(partition by team order by y) as y_row_number,
| y - row_number() over(partition by team order by y) as next_y
| from v_data) v
""".stripMargin) //.show()
/**
* step 3: group by the step-2 difference (next_y) and the team name, then take min and max of the year
*/
spark.sql(
"""
|select team, min(y), max(y), next_y
| from (select team, y,
| row_number() over(partition by team order by y) as y_row_number,
| y - row_number() over(partition by team order by y) as next_y
| from v_data) v
|group by team, next_y
""".stripMargin)//.show()
/**
* step 4: filter out the groups where min(y) equals max(y), i.e. titles that are not part of any consecutive run
*/
spark.sql(
"""
|select team, min(y), max(y)
| from (select team, y,
| row_number() over(partition by team order by y) as y_row_number,
| y - row_number() over(partition by team order by y) as next_y
| from v_data) v
|group by team, next_y having min(y)<max(y) order by 2
""".stripMargin).show()
}
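For comparison, the same gaps-and-islands trick can be written with the DataFrame API instead of SQL strings. A minimal sketch against the same CSV and spark session; the function name orderByForDF and the column names grp, start_y and end_y are just illustrative:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, max, min, row_number}

def orderByForDF() = {
  val w = Window.partitionBy("team").orderBy("y")
  spark.read
    .option("header", true)
    .option("inferSchema", true)
    .csv("d://ord.csv")
    .withColumn("grp", col("y") - row_number().over(w)) // constant within a consecutive run
    .groupBy("team", "grp")
    .agg(min("y").as("start_y"), max("y").as("end_y"))
    .where(col("start_y") < col("end_y")) // drop one-year "runs"
    .orderBy("start_y")
    .show()
}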
Ranking
Three ranking functions: row_number, dense_rank and rank.
def orderBy() = {
// compare Spark's ranking functions
val schema = StructType(
StructField("cookid", StringType, false) ::
StructField("pv", IntegerType, false) :: Nil
)
val df = spark.read
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") // without this option: java.lang.IllegalArgumentException: Illegal pattern component: XXX
.option("delimiter", ",") // field delimiter
.schema(schema) // explicit schema, so type inference is unnecessary
.csv("d://orderby.csv")
df.createOrReplaceTempView("v_orderby")
spark.sql(
"""
|select cookid, pv,
| row_number() over (partition by cookid order by pv desc) as pv_row_number,
| dense_rank() over (partition by cookid order by pv desc) as pv_dense_rank,
| rank() over (partition by cookid order by pv desc) as pv_rank
| from v_orderby
""".stripMargin).show()
}
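The three functions differ only when pv values tie: row_number always keeps counting, rank leaves a gap after a tie, and dense_rank does not. A minimal in-memory sketch (hypothetical values, not the orderby.csv above; expected outputs are in the trailing comments):

import spark.implicits._

val demo = Seq(
  ("cookie1", 10), ("cookie1", 10), ("cookie1", 7), ("cookie1", 5)
).toDF("cookid", "pv")
demo.createOrReplaceTempView("v_demo")
spark.sql(
  """
    |select cookid, pv,
    |       row_number() over (partition by cookid order by pv desc) as pv_row_number, -- 1, 2, 3, 4
    |       dense_rank() over (partition by cookid order by pv desc) as pv_dense_rank, -- 1, 1, 2, 3
    |       rank()       over (partition by cookid order by pv desc) as pv_rank        -- 1, 1, 3, 4
    |from v_demo
  """.stripMargin).show()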