直接上代码:
粘贴格式很乱,可以直接点击我的印象笔记:
https://app.yinxiang.com/shard/s60/nl/11100943/831e454b-b206-4b3f-93fb-00aa1e357172
1.建立spark app
from pyspark.sql import SparkSession
builder = SparkSession.builder
builder = builder.config("spark.executor.memory", "60G")
builder = builder.config("spark.driver.memory", "10G")
builder = builder.config("spark.driver.maxResultSize", "5G")
sc=SparkSession\
.builder\
.appName("RDD_and_DataFrame") \
.config("spark.some.config.option", "some-value") \
.enableHiveSupport()\
.getOrCreate()
#读取sql
df= sc.sql("")
df.take(5)
#splite 切片
df=df.map(lambda line:(int(line.split()[1]),1))
df.take(5)
#reduceBykey
df=df.reduceByKey(lambda x,y:x+y)
df.take(10)
#lambda 切片
paixu=df.map(lambda x:(x[1],x[0]))
paixu.take(10)
#排序
paixu=paixu.sortByKey(ascending=0)
paixu.take(10)
#rdd转换成df rdd.toDF
rdd1= sc.sparkContext.textFile("./movie_comment.data")
rdd1.take(2)
from collections import namedtuple
col=namedtuple('col',['a','b','c','d'])
def line_to_col(line):
cells=line.split('\t')
return col(cells[0],cells[1],cells[2],cells[3])
rdd2=rdd1.map(line_to_col)
rdd2.take(10)
df2=sc.createDataFrame(rdd2)
df2.show()
executed in 145ms, finished 12:33:19 2018-09-16
+---+----+---+---------+
| a| b| c| d|
+---+----+---+---------+
|196| 242| 3|881250949|
|186| 302| 3|891717742|
| 22| 377| 1|878887116|
|244| 51| 2|880606923|
|166| 346| 1|886397596|
|298| 474| 4|884182806|
|115| 265| 2|881171488|
|253| 465| 5|891628467|
|305| 451| 3|886324817|
| 6| 86| 3|883603013|
| 62| 257| 2|879372434|
|286|1014| 5|879781125|
|200| 222| 5|876042340|
|210| 40| 3|891035994|
|224| 29| 3|888104457|
|303| 785| 3|879485318|
|122| 387| 5|879270459|
|194| 274| 2|879539794|
|291|1042| 4|874834944|
|234|1184| 2|892079237|
+---+----+---+---------+
only showing top 20 rows
#修改列名字
df2= df2.withColumnRenamed('a','a2')\
.withColumnRenamed('b','b2')\
.withColumnRenamed('c','c2')\
.withColumnRenamed('d','d2')
df2.show(5)
executed in 77ms, finished 12:33:19 2018-09-16
+---+---+---+---------+
| a2| b2| c2| d2|
+---+---+---+---------+
|196|242| 3|881250949|
|186|302| 3|891717742|
| 22|377| 1|878887116|
|244| 51| 2|880606923|
|166|346| 1|886397596|
+---+---+---+---------+
only showing top 5 rows
#注册temptable regesterTempTable
df2.registerTempTable('t1')
df3=sc.sql("select a2,case when b2>300 then 'a' else 'b' end as juadge,d2 from t1")
df3.show(5)
executed in 107ms, finished 12:33:19 2018-09-16
+---+------+---------+
| a2|juadge| d2|
+---+------+---------+
|196| b|881250949|
|186| a|891717742|
| 22| a|878887116|
|244| b|880606923|
|166| a|886397596|
+---+------+---------+
only showing top 5 rows
#去重复
df2=df2.drop_duplicates(['a2','d2'])
df2.show(10)
executed in 1.90s, finished 12:33:21 2018-09-16
+---+----+---+---------+
| a2| b2| c2| d2|
+---+----+---+---------+
| 1| 103| 1|878542845|
|102| 195| 4|888801360|
|110|1248| 3|886989126|
|117| 179| 5|881012776|
|117| 751| 5|886018996|
| 12| 381| 4|879958902|
|125| 300| 5|892835836|
| 13| 21| 3|882399040|
|138| 100| 5|879022956|
|141| 50| 4|884584735|
+---+----+---+---------+
only showing top 10 rows
#join 默认是inner join
con=[df2.a2==df3.a2,df2.d2==df3.d2]
df2.join(df3,con,'left').select(df2.a2,df2.d2,df3.juadge).show()
executed in 1.24s, finished 12:33:22 2018-09-16
+---+---------+------+
| a2| d2|juadge|
+---+---------+------+
| 1|878542845| b|
|102|888801360| b|
|110|886989126| a|
|117|881012776| b|
|117|886018996| a|
| 12|879958902| a|
|125|892835836| b|
| 13|882399040| b|
|138|879022956| b|
|141|884584735| b|
|146|891457493| a|
|151|879524563| a|
|152|882899511| a|
|158|880134607| b|
| 16|877718755| b|
|172|875538028| a|
| 18|880132129| b|
|182|885613238| b|
|184|889912691| a|
|194|879521253| b|
+---+---------+------+
only showing top 20 rows
#left join
df4=df2.join(df3,[df2.a2==df3.a2,df2.d2==df3.d2],'left').select(df2.a2,df2.b2,df2.c2,df2.d2,df3.juadge)
df4.show(5)
df4.printSchema()
executed in 841ms, finished 12:33:23 2018-09-16
+---+----+---+---------+------+
| a2| b2| c2| d2|juadge|
+---+----+---+---------+------+
| 1| 103| 1|878542845| b|
|102| 195| 4|888801360| b|
|110|1248| 3|886989126| a|
|117| 179| 5|881012776| b|
|117| 751| 5|886018996| a|
+---+----+---+---------+------+
only showing top 5 rows
root
|-- a2: string (nullable = true)
|-- b2: string (nullable = true)
|-- c2: string (nullable = true)
|-- d2: string (nullable = true)
|-- juadge: string (nullable = true)
#改变数值类型
df4=df4.withColumn("c2",df4.b2.cast("int"))
df4.printSchema()
df4.show()
executed in 935ms, finished 12:33:24 2018-09-16
root
|-- a2: string (nullable = true)
|-- b2: string (nullable = true)
|-- c2: integer (nullable = true)
|-- d2: string (nullable = true)
|-- juadge: string (nullable = true)
+---+----+----+---------+------+
| a2| b2| c2| d2|juadge|
+---+----+----+---------+------+
| 1| 103| 103|878542845| b|
|102| 195| 195|888801360| b|
|110|1248|1248|886989126| a|
|117| 179| 179|881012776| b|
|117| 751| 751|886018996| a|
| 12| 381| 381|879958902| a|
|125| 300| 300|892835836| b|
| 13| 21| 21|882399040| b|
|138| 100| 100|879022956| b|
|141| 50| 50|884584735| b|
|146| 347| 347|891457493| a|
|151| 484| 484|879524563| a|
|152| 423| 423|882899511| a|
|158| 38| 38|880134607| b|
| 16| 282| 282|877718755| b|
|172| 580| 580|875538028| a|
| 18| 32| 32|880132129| b|
|182| 111| 111|885613238| b|
|184| 412| 412|889912691| a|
|194| 192| 192|879521253| b|
+---+----+----+---------+------+
only showing top 20 rows
#groupby 跟pandas 一模一样
df_groupby=df4.groupBy(['a2']).agg({'b2':"sum",'d2':'avg'})
df_groupby.cache() #cache一下以后数据会更快
df_groupby.show()
executed in 7.18s, finished 12:33:31 2018-09-16
+---+-------+-------------------+
| a2|sum(b2)| avg(d2)|
+---+-------+-------------------+
|675|15082.0|8.894891694166666E8|
|829|16325.0|8.863944126086956E8|
|467| 9521.0|8.795324894230769E8|
|691| 3220.0|8.755431319166666E8|
|296|19379.0|8.841969246734694E8|
|125|37475.0|8.884001631481482E8|
|451|12895.0|8.790126404347826E8|
|800| 5850.0|8.876464539473684E8|
|853| 7525.0| 8.793649598E8|
|666|43888.0|8.803274312075472E8|
|870|77522.0| 8.79217436918239E8|
|919|50000.0|8.769741582150538E8|
|926| 2348.0| 8.8849394925E8|
| 7|65775.0|8.915557552117647E8|
|447|26936.0|8.788553880547945E8|
|591|12799.0| 8.91033606E8|
| 51| 2464.0| 8.834987848E8|
|124| 2163.0|8.903035151428572E8|
|718|15475.0| 8.8335309684E8|
|307|25822.0|8.782779504473684E8|
+---+-------+-------------------+
only showing top 20 rows
#重命名列
#df5=df_groupby.withColumnRenamed(['sum(b2)'],["b2_sum"])
df5=df_groupby.withColumnRenamed('sum(b2)',"b2_sum")
df5=df5.withColumnRenamed('avg(d2)',"d2_avg")
df5.show()
executed in 45ms, finished 12:33:31 2018-09-16
+---+-------+-------------------+
| a2| b2_sum| d2_avg|
+---+-------+-------------------+
|675|15082.0|8.894891694166666E8|
|829|16325.0|8.863944126086956E8|
|467| 9521.0|8.795324894230769E8|
|691| 3220.0|8.755431319166666E8|
|296|19379.0|8.841969246734694E8|
|125|37475.0|8.884001631481482E8|
|451|12895.0|8.790126404347826E8|
|800| 5850.0|8.876464539473684E8|
|853| 7525.0| 8.793649598E8|
|666|43888.0|8.803274312075472E8|
|870|77522.0| 8.79217436918239E8|
|919|50000.0|8.769741582150538E8|
|926| 2348.0| 8.8849394925E8|
| 7|65775.0|8.915557552117647E8|
|447|26936.0|8.788553880547945E8|
|591|12799.0| 8.91033606E8|
| 51| 2464.0| 8.834987848E8|
|124| 2163.0|8.903035151428572E8|
|718|15475.0| 8.8335309684E8|
|307|25822.0|8.782779504473684E8|
+---+-------+-------------------+
only showing top 20 rows
#排序 跟pandas 也一模一样
df5=df5.sort(['b2_sum','d2_avg'],ascending=[1,0])
df5.show(4)
executed in 754ms, finished 12:33:32 2018-09-16
+---+------+-------------------+
| a2|b2_sum| d2_avg|
+---+------+-------------------+
|604|1115.0| 8.83668142E8|
|571|1558.0|8.833548885714285E8|
|309|1568.0| 8.77370353E8|
|583|1643.0|8.793843508571428E8|
+---+------+-------------------+
only showing top 4 rows
#造数据做下面的操作
#spark createdatframe 数值类型必须要一样
df5=sc.createDataFrame([('凤姐','乔峰',44,1,1,'girl'),
('李宇春','慕容复',43,22,3,'girl'),
('柳岩','鸠摩智',21,6,2,'boy'),
('康敏','星宿老仙',43,55,2,'girl'),
('苍老师','段延庆',0,44,1,'boy'),
('小泽','段正淳',43,0,2,'boy'),
('波多野结衣','摘星子',65,23,1,'girl'),
('木婉清','段誉',65,23,1,'girl'),
('李莫愁','王重阳',25,23,2,'boy')],
['girl_friend','name','salary','score','class','sex'])
df5.show()
executed in 193ms, finished 12:33:32 2018-09-16
+-----------+----+------+-----+-----+----+
|girl_friend|name|salary|score|class| sex|
+-----------+----+------+-----+-----+----+
| 凤姐| 乔峰| 44| 1| 1|girl|
| 李宇春| 慕容复| 43| 22| 3|girl|
| 柳岩| 鸠摩智| 21| 6| 2| boy|
| 康敏|星宿老仙| 43| 55| 2|girl|
| 苍老师| 段延庆| 0| 44| 1| boy|
| 小泽| 段正淳| 43| 0| 2| boy|
| 波多野结衣| 摘星子| 65| 23| 1|girl|
| 木婉清| 段誉| 65| 23| 1|girl|
| 李莫愁| 王重阳| 25| 23| 2| boy|
+-----------+----+------+-----+-----+----+
In [70]:
#选额列
from pyspark.sql import functions as func
df51=df5.select('name','girl_friend','salary','score')
df51=df5.select('*',(df5.score + 10).alias('agePlusTen'))
df51.show()
executed in 218ms, finished 12:33:32 2018-09-16
+-----------+----+------+-----+-----+----+----------+
|girl_friend|name|salary|score|class| sex|agePlusTen|
+-----------+----+------+-----+-----+----+----------+
| 凤姐| 乔峰| 44| 1| 1|girl| 11|
| 李宇春| 慕容复| 43| 22| 3|girl| 32|
| 柳岩| 鸠摩智| 21| 6| 2| boy| 16|
| 康敏|星宿老仙| 43| 55| 2|girl| 65|
| 苍老师| 段延庆| 0| 44| 1| boy| 54|
| 小泽| 段正淳| 43| 0| 2| boy| 10|
| 波多野结衣| 摘星子| 65| 23| 1|girl| 33|
| 木婉清| 段誉| 65| 23| 1|girl| 33|
| 李莫愁| 王重阳| 25| 23| 2| boy| 33|
+-----------+----+------+-----+-----+----+----------+
In [71]:
#计算平均数
means=df51.select(func.mean('salary').alias('mean_')).first()[0]
means
executed in 181ms, finished 12:33:33 2018-09-16
Out[71]:
38.77777777777778
In [72]:
#spark dataframe 添加常数列要 用lit 函数
Spark 2.2 introduces typedLit to support Seq, Map, and Tuples (SPARK-19254) and following calls should be supported (Scala):
import org.apache.spark.sql.functions.typedLit
df.withColumn("some_array", typedLit(Seq(1, 2, 3)))
df.withColumn("some_struct", typedLit(("foo", 1, .0.3)))
df.withColumn("some_map", typedLit(Map("key1" -> 1, "key2" -> 2)))
scala 才有typelit
#添加score2 一列
from pyspark.sql.functions import lit
df5=df5.withColumn('score2',lit(df5.select(func.mean('salary')).first()[0]))
df5.show()
executed in 264ms, finished 12:33:33 2018-09-16
+-----------+----+------+-----+-----+----+-----------------+
|girl_friend|name|salary|score|class| sex| score2|
+-----------+----+------+-----+-----+----+-----------------+
| 凤姐| 乔峰| 44| 1| 1|girl|38.77777777777778|
| 李宇春| 慕容复| 43| 22| 3|girl|38.77777777777778|
| 柳岩| 鸠摩智| 21| 6| 2| boy|38.77777777777778|
| 康敏|星宿老仙| 43| 55| 2|girl|38.77777777777778|
| 苍老师| 段延庆| 0| 44| 1| boy|38.77777777777778|
| 小泽| 段正淳| 43| 0| 2| boy|38.77777777777778|
| 波多野结衣| 摘星子| 65| 23| 1|girl|38.77777777777778|
| 木婉清| 段誉| 65| 23| 1|girl|38.77777777777778|
| 李莫愁| 王重阳| 25| 23| 2| boy|38.77777777777778|
+-----------+----+------+-----+-----+----+-----------------+
In [73]:
#spark 的 case when
df5=df51.select('*',func.when(df5.score.between(30,80),"good").otherwise('bad').alias('good_or_bad'))
df5.show()
executed in 207ms, finished 12:33:33 2018-09-16
+-----------+----+------+-----+-----+----+----------+-----------+
|girl_friend|name|salary|score|class| sex|agePlusTen|good_or_bad|
+-----------+----+------+-----+-----+----+----------+-----------+
| 凤姐| 乔峰| 44| 1| 1|girl| 11| bad|
| 李宇春| 慕容复| 43| 22| 3|girl| 32| bad|
| 柳岩| 鸠摩智| 21| 6| 2| boy| 16| bad|
| 康敏|星宿老仙| 43| 55| 2|girl| 65| good|
| 苍老师| 段延庆| 0| 44| 1| boy| 54| good|
| 小泽| 段正淳| 43| 0| 2| boy| 10| bad|
| 波多野结衣| 摘星子| 65| 23| 1|girl| 33| bad|
| 木婉清| 段誉| 65| 23| 1|girl| 33| bad|
| 李莫愁| 王重阳| 25| 23| 2| boy| 33| bad|
+-----------+----+------+-----+-----+----+----------+-----------+
In [74]:
#spark 的窗口函数
from pyspark.sql import Window
df6=df51.select('name',
'girl_friend',
'salary',
'score',
func.mean('score').over(Window.partitionBy('girl_friend')).alias('age_w1'),
func.sum('salary').over(Window.partitionBy('name').orderBy('salary')).alias('age_w2')
).sort('salary')
df6.show()
executed in 742ms, finished 12:33:34 2018-09-16
+----+-----------+------+-----+------+------+
|name|girl_friend|salary|score|age_w1|age_w2|
+----+-----------+------+-----+------+------+
| 段延庆| 苍老师| 0| 44| 44.0| 0|
| 鸠摩智| 柳岩| 21| 6| 6.0| 21|
| 王重阳| 李莫愁| 25| 23| 23.0| 25|
| 段正淳| 小泽| 43| 0| 0.0| 43|
|星宿老仙| 康敏| 43| 55| 55.0| 43|
| 慕容复| 李宇春| 43| 22| 22.0| 43|
| 乔峰| 凤姐| 44| 1| 1.0| 44|
| 段誉| 木婉清| 65| 23| 23.0| 65|
| 摘星子| 波多野结衣| 65| 23| 23.0| 65|
+----+-----------+------+-----+------+------+
In [75]:
#pysaprk支持python 自定义函数
concat_func=func.udf(lambda x,y:str(x)+'-'+str(y))
df6=df6.select("*",func.concat_ws('-',df6.name,df6.girl_friend).alias('husband_wife'),
concat_func(df6.name,df6.girl_friend).alias('husband_wife2'))
df6.show()
executed in 2.11s, finished 12:33:36 2018-09-16
+----+-----------+------+-----+------+------+------------+-------------+
|name|girl_friend|salary|score|age_w1|age_w2|husband_wife|husband_wife2|
+----+-----------+------+-----+------+------+------------+-------------+
| 段延庆| 苍老师| 0| 44| 44.0| 0| 段延庆-苍老师| 段延庆-苍老师|
| 鸠摩智| 柳岩| 21| 6| 6.0| 21| 鸠摩智-柳岩| 鸠摩智-柳岩|
| 王重阳| 李莫愁| 25| 23| 23.0| 25| 王重阳-李莫愁| 王重阳-李莫愁|
| 慕容复| 李宇春| 43| 22| 22.0| 43| 慕容复-李宇春| 慕容复-李宇春|
|星宿老仙| 康敏| 43| 55| 55.0| 43| 星宿老仙-康敏| 星宿老仙-康敏|
| 段正淳| 小泽| 43| 0| 0.0| 43| 段正淳-小泽| 段正淳-小泽|
| 乔峰| 凤姐| 44| 1| 1.0| 44| 乔峰-凤姐| 乔峰-凤姐|
| 段誉| 木婉清| 65| 23| 23.0| 65| 段誉-木婉清| 段誉-木婉清|
| 摘星子| 波多野结衣| 65| 23| 23.0| 65| 摘星子-波多野结衣| 摘星子-波多野结衣|
+----+-----------+------+-----+------+------+------------+-------------+
In [76]:
#pyspark dataframe 没有iterrow,不过可以通过udf来自定义
#You can use select method to operate on your dataframe using a user defined function something like this :
# columns = header.columns
# my_udf = F.udf(lambda data: "do what ever you want here " , StringType())
# myDF.select(*[my_udf(col(c)) for c in columns])
# columns=df6.columns
# myudf=func.udf(lambda df: 0 if df.,StringType())
#mid 截取字符串
df6=df6.withColumn('mid_2',df6.name.substr(1,2))
df6.show()
executed in 2.11s, finished 12:33:38 2018-09-16
+----+-----------+------+-----+------+------+------------+-------------+-----+
|name|girl_friend|salary|score|age_w1|age_w2|husband_wife|husband_wife2|mid_2|
+----+-----------+------+-----+------+------+------------+-------------+-----+
| 段延庆| 苍老师| 0| 44| 44.0| 0| 段延庆-苍老师| 段延庆-苍老师| 段延|
| 鸠摩智| 柳岩| 21| 6| 6.0| 21| 鸠摩智-柳岩| 鸠摩智-柳岩| 鸠摩|
| 王重阳| 李莫愁| 25| 23| 23.0| 25| 王重阳-李莫愁| 王重阳-李莫愁| 王重|
|星宿老仙| 康敏| 43| 55| 55.0| 43| 星宿老仙-康敏| 星宿老仙-康敏| 星宿|
| 慕容复| 李宇春| 43| 22| 22.0| 43| 慕容复-李宇春| 慕容复-李宇春| 慕容|
| 段正淳| 小泽| 43| 0| 0.0| 43| 段正淳-小泽| 段正淳-小泽| 段正|
| 乔峰| 凤姐| 44| 1| 1.0| 44| 乔峰-凤姐| 乔峰-凤姐| 乔峰|
| 段誉| 木婉清| 65| 23| 23.0| 65| 段誉-木婉清| 段誉-木婉清| 段誉|
| 摘星子| 波多野结衣| 65| 23| 23.0| 65| 摘星子-波多野结衣| 摘星子-波多野结衣| 摘星|
+----+-----------+------+-----+------+------+------------+-------------+-----+
In [79]:
#替换字符串 replace 跟 pandas series.str.replace差不多
df6=df6.replace(['李宇春','凤姐'],['如花','石榴姐'],'girl_friend')
df6.show()
executed in 2.01s, finished 12:33:40 2018-09-16
+----+-----------+------+-----+------+------+------------+-------------+-----+
|name|girl_friend|salary|score|age_w1|age_w2|husband_wife|husband_wife2|mid_2|
+----+-----------+------+-----+------+------+------------+-------------+-----+
| 段延庆| 苍老师| 0| 44| 44.0| 0| 段延庆-苍老师| 段延庆-苍老师| 段延|
| 鸠摩智| 柳岩| 21| 6| 6.0| 21| 鸠摩智-柳岩| 鸠摩智-柳岩| 鸠摩|
| 王重阳| 李莫愁| 25| 23| 23.0| 25| 王重阳-李莫愁| 王重阳-李莫愁| 王重|
| 段正淳| 小泽| 43| 0| 0.0| 43| 段正淳-小泽| 段正淳-小泽| 段正|
|星宿老仙| 康敏| 43| 55| 55.0| 43| 星宿老仙-康敏| 星宿老仙-康敏| 星宿|
| 慕容复| 如花| 43| 22| 22.0| 43| 慕容复-李宇春| 慕容复-李宇春| 慕容|
| 乔峰| 石榴姐| 44| 1| 1.0| 44| 乔峰-凤姐| 乔峰-凤姐| 乔峰|
| 段誉| 木婉清| 65| 23| 23.0| 65| 段誉-木婉清| 段誉-木婉清| 段誉|
| 摘星子| 波多野结衣| 65| 23| 23.0| 65| 摘星子-波多野结衣| 摘星子-波多野结衣| 摘星|
+----+-----------+------+-----+------+------+------------+-------------+-----+
In [80]:
#统计函数 介乎mllib
'''from pyspark.mllib.stat import Statistics as ss
#summary=ss.colStats(df6.score.getItem)
#summary.max()'''
executed in 3ms, finished 12:33:40 2018-09-16
#rdd takeOrderd take 跟 order 结合体
df9=df8.rdd
df9=df9.takeOrdered(10,lambda (x,y):-y)'''
#去重 distinct count
df7=df6.select('name').distinct().count()
df7
executed in 373ms, finished 12:33:41 2018-09-16
Out[83]:
9
In [84]:
#df filter 根 pandas 也是一模一样,只不过pandas 直接 df=df[df[name]>0 & df[score]>0] spark要加个df.filter()
#pandas df=df[条件]
#spark df=df.filter(条件)
df8=df5.filter((df5['class']==1)&(df5['salary']>0))
df8.show()
executed in 205ms, finished 12:33:41 2018-09-16
+-----------+----+------+-----+-----+----+----------+-----------+
|girl_friend|name|salary|score|class| sex|agePlusTen|good_or_bad|
+-----------+----+------+-----+-----+----+----------+-----------+
| 凤姐| 乔峰| 44| 1| 1|girl| 11| bad|
| 波多野结衣| 摘星子| 65| 23| 1|girl| 33| bad|
| 木婉清| 段誉| 65| 23| 1|girl| 33| bad|
+-----------+----+------+-----+-----+----+----------+-----------+
In [85]:
# filter then select some certain columns
df8=df5.filter((df5['class']==1)&(df5['salary']>0)).select('name','girl_friend','salary')
df8.show()
executed in 190ms, finished 12:33:41 2018-09-16
+----+-----------+------+
|name|girl_friend|salary|
+----+-----------+------+
| 乔峰| 凤姐| 44|
| 摘星子| 波多野结衣| 65|
| 段誉| 木婉清| 65|
+----+-----------+------+
In [86]:
#fillna 跟pandas一抹一眼
df5=df5.fillna(0)
df5.show()
executed in 187ms, finished 12:33:41 2018-09-16
+-----------+----+------+-----+-----+----+----------+-----------+
|girl_friend|name|salary|score|class| sex|agePlusTen|good_or_bad|
+-----------+----+------+-----+-----+----+----------+-----------+
| 凤姐| 乔峰| 44| 1| 1|girl| 11| bad|
| 李宇春| 慕容复| 43| 22| 3|girl| 32| bad|
| 柳岩| 鸠摩智| 21| 6| 2| boy| 16| bad|
| 康敏|星宿老仙| 43| 55| 2|girl| 65| good|
| 苍老师| 段延庆| 0| 44| 1| boy| 54| good|
| 小泽| 段正淳| 43| 0| 2| boy| 10| bad|
| 波多野结衣| 摘星子| 65| 23| 1|girl| 33| bad|
| 木婉清| 段誉| 65| 23| 1|girl| 33| bad|
| 李莫愁| 王重阳| 25| 23| 2| boy| 33| bad|
+-----------+----+------+-----+-----+----+----------+-----------+
In [87]:
#dropna 跟pandas一抹一眼
df5=df5.dropna()
df5.show()
executed in 182ms, finished 12:33:41 2018-09-16
+-----------+----+------+-----+-----+----+----------+-----------+
|girl_friend|name|salary|score|class| sex|agePlusTen|good_or_bad|
+-----------+----+------+-----+-----+----+----------+-----------+
| 凤姐| 乔峰| 44| 1| 1|girl| 11| bad|
| 李宇春| 慕容复| 43| 22| 3|girl| 32| bad|
| 柳岩| 鸠摩智| 21| 6| 2| boy| 16| bad|
| 康敏|星宿老仙| 43| 55| 2|girl| 65| good|
| 苍老师| 段延庆| 0| 44| 1| boy| 54| good|
| 小泽| 段正淳| 43| 0| 2| boy| 10| bad|
| 波多野结衣| 摘星子| 65| 23| 1|girl| 33| bad|
| 木婉清| 段誉| 65| 23| 1|girl| 33| bad|
| 李莫愁| 王重阳| 25| 23| 2| boy| 33| bad|
+-----------+----+------+-----+-----+----+----------+-----------+
In [88]:
#转换数据类型
from pyspark.sql.types import DoubleType
df5=df5.withColumn('salary',df5['salary'].cast(DoubleType()))
df5.show()
executed in 187ms, finished 12:33:41 2018-09-16
+-----------+----+------+-----+-----+----+----------+-----------+
|girl_friend|name|salary|score|class| sex|agePlusTen|good_or_bad|
+-----------+----+------+-----+-----+----+----------+-----------+
| 凤姐| 乔峰| 44.0| 1| 1|girl| 11| bad|
| 李宇春| 慕容复| 43.0| 22| 3|girl| 32| bad|
| 柳岩| 鸠摩智| 21.0| 6| 2| boy| 16| bad|
| 康敏|星宿老仙| 43.0| 55| 2|girl| 65| good|
| 苍老师| 段延庆| 0.0| 44| 1| boy| 54| good|
| 小泽| 段正淳| 43.0| 0| 2| boy| 10| bad|
| 波多野结衣| 摘星子| 65.0| 23| 1|girl| 33| bad|
| 木婉清| 段誉| 65.0| 23| 1|girl| 33| bad|
| 李莫愁| 王重阳| 25.0| 23| 2| boy| 33| bad|
+-----------+----+------+-----+-----+----+----------+-----------+
In [89]:
df5.printSchema
executed in 5ms, finished 12:33:41 2018-09-16
Out[89]:
<bound method DataFrame.printSchema of DataFrame[girl_friend: string, name: string, salary: double, score: bigint, class: bigint, sex: string, agePlusTen: bigint, good_or_bad: string]>
In [90]:
#crosstab 交叉表 类似于透视表
df5.crosstab('class','name').show()
executed in 345ms, finished 12:33:42 2018-09-16
+----------+---+---+---+----+---+---+---+---+---+
|class_name| 乔峰|慕容复|摘星子|星宿老仙|段延庆|段正淳| 段誉|王重阳|鸠摩智|
+----------+---+---+---+----+---+---+---+---+---+
| 2| 0| 0| 0| 1| 0| 1| 0| 1| 1|
| 1| 1| 0| 1| 0| 1| 0| 1| 0| 0|
| 3| 0| 1| 0| 0| 0| 0| 0| 0| 0|
+----------+---+---+---+----+---+---+---+---+---+
#df.dropDuplicates 跟pandas 一模一样
df.dropDuplicates(['name', 'height']).show()
df.show()
executed in 1.24s, finished 12:33:43 2018-09-16
+---+------+-----+
|age|height| name|
+---+------+-----+
| 5| 80|Alice|
| 5| 80|Alice|
| 10| 80|Alice|
+---+------+-----+
+---+------+-----+
|age|height| name|
+---+------+-----+
| 5| 80|Alice|
| 10| 80|Alice|
+---+------+-----+
+---+------+-----+
|age|height| name|
+---+------+-----+
| 5| 80|Alice|
+---+------+-----+
+---+------+-----+
|age|height| name|
+---+------+-----+
| 5| 80|Alice|
| 5| 80|Alice|
| 10| 80|Alice|
+---+------+-----+
In [96]:
#把性别映射成1 2 3 以后可以机器学习
from pyspark.ml.feature import StringIndexer
si=StringIndexer(inputCol='sex',outputCol='sex_indexed')
df5.show()
df6=si.fit(df5).transform(df5)
executed in 365ms, finished 12:50:16 2018-09-16
+-----------+----+------+-----+-----+----+----------+-----------+
|girl_friend|name|salary|score|class| sex|agePlusTen|good_or_bad|
+-----------+----+------+-----+-----+----+----------+-----------+
| 凤姐| 乔峰| 44.0| 1| 1|girl| 11| bad|
| 李宇春| 慕容复| 43.0| 22| 3|girl| 32| bad|
| 柳岩| 鸠摩智| 21.0| 6| 2| boy| 16| bad|
| 康敏|星宿老仙| 43.0| 55| 2|girl| 65| good|
| 苍老师| 段延庆| 0.0| 44| 1| boy| 54| good|
| 小泽| 段正淳| 43.0| 0| 2| boy| 10| bad|
| 波多野结衣| 摘星子| 65.0| 23| 1|girl| 33| bad|
| 木婉清| 段誉| 65.0| 23| 1|girl| 33| bad|
| 李莫愁| 王重阳| 25.0| 23| 2| boy| 33| bad|
+-----------+----+------+-----+-----+----+----------+-----------+