一、建立RDD
# Build the path to the "info" data file under the current working directory.
path = os.getcwd()
file = os.path.join(path, "info")  # fix: original had typo `patj` (NameError)
print(file)  # fix: print the path variable, not the literal string 'file'
# Read the text file into an RDD and count the records
rawuserrdd = sc.textFile(file)
print(rawuserrdd.count())
# Inspect the first 5 records
print(rawuserrdd.take(5))
# NOTE(review): collect() pulls the entire dataset to the driver — fine for
# this tiny demo file, but avoid it on large data.
print(rawuserrdd.collect())
打印结果
['1|24|M|teacher1|12222', '2|18|F|engineer|12513', '3|23|M|teacher|12222', '4|26|F|docter|125134', '5|34|M|teacher|25222']
['1|24|M|teacher1|12222', '2|18|F|engineer|12513', '3|23|M|teacher|12222', '4|26|F|docter|125134', '5|34|M|teacher|25222', '6|21|F|engineer|125134', '7|24|M|teacher|12222', '8|24|F|engineer|125134', '9|2|M|teacher|124222', '10|124|F|engineer|125134', '12|24|M|teacher|12222', '12|24|F|engineer|1625134', '13|24|M|teacher|12222', '14|24|F|engineer|125134', '15|24|M|teacher|12222', '16|24|F|engineer|125134', '17|24|M|teacher|124222', '18|34|F|engineer|1234', '19|56|M|teacher|12222', '20|24|F|engineer|125134', '21|24|M|teacher|12222', '22|24|F|engineer|125134']
# Split each pipe-delimited record into a list of field strings
userrdd = rawuserrdd.map(lambda line: line.split("|"))
print(userrdd.take(5))
打印结果
[['1', '24', 'M', 'teacher1', '12222'], ['2', '18', 'F', 'engineer', '12513'], ['3', '23', 'M', 'teacher', '12222'], ['4', '26', 'F', 'docter', '125134'], ['5', '34', 'M', 'teacher', '25222']]
二、建立DF
# Create the SQLContext — the DataFrame/SQL entry point in old Spark.
# NOTE(review): Spark 2+ replaces this with SparkSession; kept as-is to match
# the rest of the file.
sqlContxt = SQLContext(sc)
# Map each field list to a Row, casting userid and age to int.
# NOTE(review): salary is kept as a string (the inferred schema below shows
# string) — cast with int(x[4]) if numeric comparisons are needed.
user_row = userrdd.map(
    lambda x: Row(
        userid=int(x[0]),
        age=int(x[1]),
        gender=x[2],
        occupation=x[3],
        salary=x[4],
    )
)
打印结果:
[Row(age=24, gender='M', occupation='teacher1', salary='12222', userid=1), Row(age=18, gender='F', occupation='engineer', salary='12513', userid=2), Row(age=23, gender='M', occupation='teacher', salary='12222', userid=3), Row(age=26, gender='F', occupation='docter', salary='125134', userid=4), Row(age=34, gender='M', occupation='teacher', salary='25222', userid=5)]
# Build the DataFrame from the Row RDD and inspect its inferred schema.
user_df = sqlContxt.createDataFrame(user_row)
# fix: printSchema() writes to stdout and returns None, so wrapping it in
# print() emitted a spurious "None" line after the schema.
user_df.printSchema()
打印结果:
root
|-- age: long (nullable = true)
|-- gender: string (nullable = true)
|-- occupation: string (nullable = true)
|-- salary: string (nullable = true)
|-- userid: long (nullable = true)
# Give the DataFrame an alias so its columns can be referenced as df.col or
# df['col'] below. (fix: the original had this note as a bare, uncommented
# line, which is a SyntaxError when the script is executed.)
df = user_df.alias('df')
三、使用SparkSQL
# 1. Register the DataFrame as a temporary table for SQL queries.
# NOTE(review): registerTempTable is deprecated in Spark 2+ in favour of
# createOrReplaceTempView — kept because this file uses the old SQLContext API.
user_df.registerTempTable('user_table')
# 2. Count all rows with Spark SQL.
# fix: dropped the `re = ...` assignments — show() returns None, so they only
# captured None while shadowing the stdlib `re` module name.
sqlContxt.sql('select count(*) from user_table').show()
# Row count per gender
sqlContxt.sql('select gender,count(*) from user_table group by gender').show()
# Same count query again, displaying at most 2 rows
sqlContxt.sql('select count(*) from user_table').show(2)
打印结果:
+--------+
|count(1)|
+--------+
| 22|
+--------+
+------+--------+
|gender|count(1)|
+------+--------+
| F| 11|
| M| 11|
+------+--------+
+--------+
|count(1)|
+--------+
| 22|
+--------+
四、select 选取部分信息
# Select a subset of fields (userid, occupation, salary) with the plain RDD API
userrddnew = userrdd.map(lambda x: (x[0], x[3], x[4]))
# Select with the DataFrame API — three equivalent ways to reference columns
# (fix: original comment had the typo "san种方法" for 三种方法 / "three ways")
user_df.select('userid', 'occupation', 'salary').show(5)
user_df.select(df.userid, df.occupation, df.salary).show(5)
user_df.select(df['userid'], df['occupation'], df['salary']).show(5)
# Select the same three columns via a raw Spark SQL statement
query = '''
select userid,occupation,salary from user_table
'''
sqlContxt.sql(query).show()