文章目录
一.问题描述
有这么一个类通讯录的数据集如下截图:
我想得到共同好友的清单
希望得到的输出类似如下:
(B,C) -->(A)
(F,G) -->(B,C)
这个场景只是一个简单的数据场景,真实的场景会比这个复杂很多。
二.解决方案
2.1 梳理解决方案
首先我们不要把问题想得太复杂,我们只需要求两个人的共同好友,并没有要求更多的,例如3个人甚至更多的。
第一步
根据col1列进行聚合,类似于mysql的group_concat功能。
第二步
根据合并后的col2列,进行两两拆分,如下图所示:
这样就可以拆分出 B,C有共同好友A。
第三步
根据col-new2列进行分组,将col1列进行聚合,类似于mysql的group_concat功能。
2.2 整理代码思路
上一节第一步和第三步同关系型数据库的group_concat功能,spark的rdd通过groupBykey可以直接实现。
难点在于第二步。
好在python强大的第三方库,真的不要太给力。
python的itertools库的combinations函数可以直接实现此功能。
语法:
combinations(p,r)
从p中找出所有长度为r的排列情况… 有顺序
测试记录:
>>> from itertools import combinations
>>>
>>> print(list(combinations([1,2,3,4,5,6],2)))
[(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (2, 3), (2, 4), (2, 5), (2, 6), (3, 4), (3, 5), (3, 6), (4, 5), (4, 6), (5, 6)]
>>> print(list(combinations([1,2,3,4,5,6],3)))
[(1, 2, 3), (1, 2, 4), (1, 2, 5), (1, 2, 6), (1, 3, 4), (1, 3, 5), (1, 3, 6), (1, 4, 5), (1, 4, 6), (1, 5, 6), (2, 3, 4), (2, 3, 5), (2, 3, 6), (2, 4, 5), (2, 4
, 6), (2, 5, 6), (3, 4, 5), (3, 4, 6), (3, 5, 6), (4, 5, 6)]
>>>
>>> print(list(combinations([1,2,3,4,5,6],4)))
[(1, 2, 3, 4), (1, 2, 3, 5), (1, 2, 3, 6), (1, 2, 4, 5), (1, 2, 4, 6), (1, 2, 5, 6), (1, 3, 4, 5), (1, 3, 4, 6), (1, 3, 5, 6), (1, 4, 5, 6), (2, 3, 4, 5), (2, 3
, 4, 6), (2, 3, 5, 6), (2, 4, 5, 6), (3, 4, 5, 6)]
>>>
>>>
>>> print(list(combinations([1,2,3,4,5,6],5)))
[(1, 2, 3, 4, 5), (1, 2, 3, 4, 6), (1, 2, 3, 5, 6), (1, 2, 4, 5, 6), (1, 3, 4, 5, 6), (2, 3, 4, 5, 6)]
>>>
>>>
>>> print(list(combinations([1,2,3,4,5,6],6)))
[(1, 2, 3, 4, 5, 6)]
>>>
2.3 最终解决方案
2.3.1 PySpark RDD解决方案
代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pyspark import SparkContext, SparkConf
from itertools import combinations
from itertools import combinations
def combine_friends(key,values):
iters = combinations(list(values),2)
for us in iters:
yield(us,key)
spark_conf = SparkConf().setMaster("local[1]").setAppName("Find-Common-Friends")
sc = sc=SparkContext.getOrCreate(spark_conf)
rdd=sc.parallelize([('A','B'),('A','C'),('A','D'),('A','E'),('A','F'),('B','D'),('B','E'),('B','F'),('B','G'),('C','F'),('C','G'),('C','H')])
rdd1= rdd.groupByKey()
# flatMap 一对多的操作
rdd2=rdd1.flatMap(lambda x:combine_friends(x[0],x[1]))
rdd3=rdd2.groupByKey()
# 需要list一下,不然groupBykey输出的是集合而非列表
rdd4=rdd3.mapValues(list)
#print(rdd4.collect())
# 保存到文件
rdd4.coalesce(1).saveAsTextFile("file:///home/pyspark/output1.csv")
sc.stop()
测试记录:
[root@hp2 output1.csv]# more part-00000
(('C', 'F'), ['A'])
(('B', 'C'), ['A'])
(('E', 'G'), ['B'])
(('D', 'G'), ['B'])
(('F', 'G'), ['C', 'B'])
(('D', 'F'), ['A', 'B'])
(('E', 'F'), ['A', 'B'])
(('C', 'E'), ['A'])
(('D', 'E'), ['A', 'B'])
(('B', 'F'), ['A'])
(('G', 'H'), ['C'])
(('B', 'D'), ['A'])
(('B', 'E'), ['A'])
(('C', 'D'), ['A'])
(('F', 'H'), ['C'])
[root@hp2 output1.csv]#
2.3.2 PySpark Spark SQL解决方案
上述需求其实通过Spark SQL也可以解决。
将数据录入到Hive
create table test1(id1 string,id2 string);
insert into test1 values ('A','B');
insert into test1 values ('A','C');
insert into test1 values ('A','D');
insert into test1 values ('A','E');
insert into test1 values ('A','F');
insert into test1 values ('B','D');
insert into test1 values ('B','E');
insert into test1 values ('B','F');
insert into test1 values ('B','G');
insert into test1 values ('C','F');
insert into test1 values ('C','G');
insert into test1 values ('C','H');
代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession
# 创建一个连接
spark = SparkSession. \
Builder(). \
appName('sql'). \
master('local'). \
getOrCreate()
# spark.sql("use test")
# 拼接sql语句
hive_sql1 = "create table tmp1 as " \
"SELECT tmp1.id1, " \
" tmp1.id2 id2_1, " \
" tmp2.id2 id2_2" \
" from test1 tmp1 " \
" inner join test1 tmp2 " \
" on tmp1.id1 = tmp2.id1 " \
" where tmp1.id2 < tmp2.id2 "
hive_sql2 ="create table tmp2 as " \
"select concat(id2_1,',',id2_2) as id2_new, " \
" concat_ws(',', collect_set(id1)) as id1_new " \
" from tmp1 " \
" group by concat(id2_1,',',id2_2) "
# 运行sql语句
df1 = spark.sql(hive_sql1)
df2 = spark.sql(hive_sql2)
# 关闭spark回话
spark.stop()
测试记录:
hive>
> select * from tmp1;
OK
tmp1.id1 tmp1.id2_1 tmp1.id2_2
A B F
A B E
A B D
A B C
A C F
A C E
A C D
C G H
A D F
A D E
A E F
B D G
B D F
B D E
B E G
B E F
B F G
C F H
C F G
Time taken: 1.779 seconds, Fetched: 19 row(s)
hive> select * from tmp2;
OK
tmp2.id2_new tmp2.id1_new
F,H C
B,C A
C,E A
B,E A
C,D A
E,F B,A
G,H C
B,D A
B,F A
D,E B,A
E,G B
C,F A
F,G C,B
D,G B
D,F B,A
Time taken: 0.21 seconds, Fetched: 15 row(s)
hive>
2.3.3 PySpark DataFrame解决方案
我们同样可以使用DataFrame来解决这个问题
将数据录入到Hive
create table test1(id1 string,id2 string);
insert into test1 values ('A','B');
insert into test1 values ('A','C');
insert into test1 values ('A','D');
insert into test1 values ('A','E');
insert into test1 values ('A','F');
insert into test1 values ('B','D');
insert into test1 values ('B','E');
insert into test1 values ('B','F');
insert into test1 values ('B','G');
insert into test1 values ('C','F');
insert into test1 values ('C','G');
insert into test1 values ('C','H');
代码:
备注: fun_concat 为了测试自定义函数,其实可以使用concat代替
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
from pyspark.sql.functions import udf,collect_list
from pyspark.sql.types import StringType
# 创建一个连接
spark = SparkSession. \
Builder(). \
appName('sql'). \
master('local'). \
getOrCreate()
sc = spark.sparkContext
def fun_concat(x, y):
return str(x) + "," + str(y)
udf_concat = udf(fun_concat,StringType())
# spark.sql执行默认是取值Hive表,类型是DataFrame类型
# spark.sql("use test")
df1 = spark.sql("select id1,id2 as id2_1 from test1")
df2 = spark.sql("select id1,id2 as id2_2 from test1")
df3 = df1.join(df2,df1.id1 == df2.id1 ,'inner').select(df1.id1, df1.id2_1, df2.id2_2)
df4 = df3.select("id1", "id2_1", "id2_2").where(" id2_1 < id2_2 ")
df5 = df4.withColumn("id2_new",udf_concat(df4['id2_1'],df4['id2_2']))
df6 = df5.drop('id2_1','id2_2')
df7 = df6.groupby('id2_new').agg(collect_list(df6["id1"]).alias("id1_new"))
df7.show()
测试记录: