import os

# Point Spark at the local installation before any context is created.
os.environ['SPARK_HOME'] = "/opt/spark-2.0.1-bin-hadoop2.7"

# BUG FIX: the original transcript used SparkContext/HiveContext without
# importing them. (HiveContext is the Spark 1.x/2.0 entry point; matches
# the spark-2.0.1 install above.)
from pyspark import SparkContext
from pyspark.sql import HiveContext

# Only one SparkContext may exist per JVM; getOrCreate reuses it if present.
sc = SparkContext.getOrCreate()
sqlContext = HiveContext(sc)

# Build two small DataFrames with columns (name, age).
# BUG FIX: the original chained .collect() onto createDataFrame(), which
# turns the result into a plain list of Rows — every later .collect(),
# .join() and .show() call in the session would then fail with
# AttributeError. Keep DataFrames; call .collect() only when rows are needed.
l = [('Alice', 1), ('jack', 5), ('kuna', 10)]
df12 = sqlContext.createDataFrame(l, ['name', 'age'])
l1 = [('xiaohua', 5), ('xiaoke', 10), ('Alice', 5)]
df22 = sqlContext.createDataFrame(l1, ['name', 'age'])
df12.collect()
Out[60]:
[Row(name=u'Alice', age=1),
Row(name=u'jack', age=5),
Row(name=u'kuna', age=10)]
df22.collect()
Out[64]:
[Row(name=u'xiaohua', age=5),
Row(name=u'xiaoke', age=10),
Row(name=u'Alice', age=5)]
In[65]: df12.join(df22, df12.name == df22.name, 'inner').collect()
Out[65]: [Row(name=u'Alice', age=1, name=u'Alice', age=5)]
In[66]: df12.join(df22, df12.name == df22.name, 'left_outer').collect()
Out[66]:
[Row(name=u'jack', age=5, name=None, age=None),
Row(name=u'kuna', age=10, name=None, age=None),
Row(name=u'Alice', age=1, name=u'Alice', age=5)]
In[67]: df12.join(df22, df12.name == df22.name, 'right_outer').collect()
Out[67]:
[Row(name=None, age=None, name=u'xiaohua', age=5),
Row(name=u'Alice', age=1, name=u'Alice', age=5),
Row(name=None, age=None, name=u'xiaoke', age=10)]
In[68]: df12.join(df22, df12.name == df22.name, 'leftsemi').collect()
Out[68]: [Row(name=u'Alice', age=1)]
df22.show()
+-------+---+
| name|age|
+-------+---+
|xiaohua| 5|
| xiaoke| 10|
| Alice| 5|
+-------+---+
# Left-outer-join df22 (name, age) against df33 on df22.name == df33.mingzi;
# unmatched df22 rows get NULLs in the df33 columns.
# NOTE(review): df33 is not defined anywhere in this transcript — from the
# tmp1.show() output below it presumably has columns ('mingzi', 'age') with
# at least the row ('Alice', 5); confirm where it is created.
tmp1=df22.join(df33, df22.name == df33.mingzi, 'left_outer')
In[98]: tmp1.show()
+-------+---+------+----+
| name|age|mingzi| age|
+-------+---+------+----+
|xiaohua| 5| null|null|
| Alice| 5| Alice| 5|
| xiaoke| 10| null|null|
+-------+---+------+----+
In[99]: tmp1.createOrReplaceTempView('tmp1')
In[100]: tmp11 = sqlContext.sql("select count(*) as num from tmp1 where name == mingzi")
In[101]: tmp11
Out[101]: DataFrame[num: bigint]
In[102]: tmp11.show()
+---+
|num|
+---+
| 1|
+---+
In[103]: tmp11 = sqlContext.sql("select count(*) as num from tmp1 where name != mingzi")
In[104]: tmp11.show()
+---+
|num|
+---+
| 0|
+---+
Question: shouldn't the last `num` be 2? Why is the count 0 — does the join only count matched rows?
Answer: after the left outer join, the two unmatched rows have `mingzi = NULL`, and in SQL any comparison with NULL (including `name != mingzi`) evaluates to NULL (unknown), not true, so the WHERE clause filters those rows out. The only row where both sides are non-NULL is ('Alice', 'Alice'), which fails `name != mingzi` — hence 0. To count the unmatched rows, use `WHERE mingzi IS NULL` instead.