Analogous Data Operations with RDD, DataFrame, and Spark SQL

1. Creating an RDD

import os
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext(appName="demo")  # in the pyspark shell `sc` already exists

path = os.getcwd()
file = path + "/info"
print(file)

# Read the text file and count the records
rawuserrdd = sc.textFile(file)
print(rawuserrdd.count())
# Look at the first 5 records
print(rawuserrdd.take(5))


# Print all records (collect() pulls the whole dataset to the driver)
print(rawuserrdd.collect())

Output:
['1|24|M|teacher1|12222', '2|18|F|engineer|12513', '3|23|M|teacher|12222', '4|26|F|docter|125134', '5|34|M|teacher|25222']
['1|24|M|teacher1|12222', '2|18|F|engineer|12513', '3|23|M|teacher|12222', '4|26|F|docter|125134', '5|34|M|teacher|25222', '6|21|F|engineer|125134', '7|24|M|teacher|12222', '8|24|F|engineer|125134', '9|2|M|teacher|124222', '10|124|F|engineer|125134', '12|24|M|teacher|12222', '12|24|F|engineer|1625134', '13|24|M|teacher|12222', '14|24|F|engineer|125134', '15|24|M|teacher|12222', '16|24|F|engineer|125134', '17|24|M|teacher|124222', '18|34|F|engineer|1234', '19|56|M|teacher|12222', '20|24|F|engineer|125134', '21|24|M|teacher|12222', '22|24|F|engineer|125134']
# Split each line on |
userrdd=rawuserrdd.map(lambda x:x.split("|"))
print(userrdd.take(5))
Output:
[['1', '24', 'M', 'teacher1', '12222'], ['2', '18', 'F', 'engineer', '12513'], ['3', '23', 'M', 'teacher', '12222'], ['4', '26', 'F', 'docter', '125134'], ['5', '34', 'M', 'teacher', '25222']]
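
If the input file might contain blank or otherwise malformed lines, it can help to filter before splitting. A minimal optional sketch, not part of the original flow:

# Defensive variant: drop empty lines before splitting
userrdd = (rawuserrdd
           .filter(lambda line: line.strip() != "")
           .map(lambda line: line.split("|")))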


2. Creating a DataFrame

# Create the SQLContext
sqlContext = SQLContext(sc)

# Define the schema via Row

user_row = userrdd.map(
    lambda x:
    Row(
        userid=int(x[0]),
        age=int(x[1]),
        gender=x[2],
        occupation=x[3],
        salary=x[4]
    )
)
print(user_row.take(5))
Output:
[Row(age=24, gender='M', occupation='teacher1', salary='12222', userid=1), Row(age=18, gender='F', occupation='engineer', salary='12513', userid=2), Row(age=23, gender='M', occupation='teacher', salary='12222', userid=3), Row(age=26, gender='F', occupation='docter', salary='125134', userid=4), Row(age=34, gender='M', occupation='teacher', salary='25222', userid=5)]
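
Note that the fields print in alphabetical order: in PySpark before 3.0, Row(**kwargs) sorts the keyword arguments by field name. If you want to pin both the column order and the types explicitly, you can define the schema with StructType instead. A minimal sketch, assuming the same five fields:

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

explicit_schema = StructType([
    StructField("userid", IntegerType(), True),
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("occupation", StringType(), True),
    StructField("salary", StringType(), True),
])
# Convert each record to a typed tuple matching the schema
typed_rdd = userrdd.map(lambda x: (int(x[0]), int(x[1]), x[2], x[3], x[4]))
user_df_explicit = sqlContext.createDataFrame(typed_rdd, explicit_schema)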

# Create the DataFrame
user_df = sqlContext.createDataFrame(user_row)
# printSchema() prints directly and returns None, so don't wrap it in print()
user_df.printSchema()

Output:
root
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- userid: long (nullable = true)
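
salary was left as a string in the Row mapping, so it is inferred as string. If you need it as a number, you can cast the column. A small optional sketch:

from pyspark.sql.functions import col

# Cast salary from string to integer (optional)
user_df = user_df.withColumn("salary", col("salary").cast("int"))
user_df.printSchema()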

Give the DataFrame an alias (used below as df.column / df['column']):

# Alias the DataFrame
df = user_df.alias('df')

3. Using Spark SQL

# 1. Register the DataFrame as a temporary table
user_df.registerTempTable('user_table')
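
registerTempTable is the Spark 1.x API; on Spark 2.0 and later the same registration is done with:

user_df.createOrReplaceTempView('user_table')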

# 2. Count the rows with Spark SQL (show() prints and returns None)
sqlContext.sql('select count(*) from user_table').show()

# Count the rows per gender
sqlContext.sql('select gender, count(*) from user_table group by gender').show()

# show(n) caps how many result rows are displayed
sqlContext.sql('select count(*) from user_table').show(2)

Output:
+--------+
|count(1)|
+--------+
|      22|
+--------+

+------+--------+
|gender|count(1)|
+------+--------+
|     F|      11|
|     M|      11|
+------+--------+

+--------+
|count(1)|
+--------+
|      22|
+--------+
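
For comparison, the same per-gender count with the plain RDD API. A minimal sketch:

# Same aggregation without SQL: map each record to (gender, 1) and reduce
gender_counts = userrdd.map(lambda x: (x[2], 1)).reduceByKey(lambda a, b: a + b)
print(gender_counts.collect())  # e.g. [('M', 11), ('F', 11)]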

4. Selecting a Subset of Columns

# Select columns with the RDD API (returns plain tuples, no column names)
userrddnew = userrdd.map(lambda x: (x[0], x[3], x[4]))
print(userrddnew.take(5))


# Select columns with the DataFrame API
# Three equivalent ways
user_df.select('userid', 'occupation', 'salary').show(5)
user_df.select(df.userid, df.occupation, df.salary).show(5)
user_df.select(df['userid'], df['occupation'], df['salary']).show(5)


# Select columns with Spark SQL
sqlContext.sql('''
select userid, occupation, salary from user_table
''').show()
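
If you need the rows back in Python rather than just printed, collect() the query result. A minimal sketch:

# collect() returns a list of Row objects with named fields
rows = sqlContext.sql('select userid, occupation, salary from user_table').collect()
for r in rows[:3]:
    print(r.userid, r.occupation, r.salary)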