Start the three machines.
Start Hadoop first, then Spark:
cd /usr/local/hadoop
sbin/start-all.sh
cd ../spark
sbin/start-all.sh
cd /usr/local/spark/bin
./pyspark
(the ./ prefix is needed unless Spark's bin directory is on your PATH)
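The pyspark shell pre-creates a SparkSession and binds it to the name spark, which all of the commands below rely on. If you instead run the same code as a standalone script (e.g. via spark-submit), you have to build the session yourself; a minimal sketch, with the app name "sparksql-demo" chosen arbitrarily:

from pyspark.sql import SparkSession

# In the pyspark shell this object already exists as `spark`;
# a standalone script has to create it.
spark = SparkSession.builder.appName("sparksql-demo").getOrCreate()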
Enter the following:
peopleDF=spark.read.format("json").load("file:///usr/local/spark/examples/src/main/resources/people.json")
peopleDF.show()
peopleDF.select("name","age").write.format("json").save("file:///usr/local/spark/mycode/sparksql/newpeople.json")
peopleDF.select("name").write.format("text").save("file:///usr/local/spark/mycode/sparksql/newpeople.txt")
peopleDF=spark.read.format("json").load("file:///usr/local/spark/mycode/sparksql/newpeople.json")
peopleDF.show()
Common operations on a DataFrame follow; a self-contained sketch combining all of them appears after the list.
df=spark.read.json("file:///usr/local/spark/examples/src/main/resources/people.json")
- printSchema()
df.printSchema()
- select()
df.select(df["name"],df["age"]+1).show()
- filter()
df.filter(df["age"]>20).show()
- groupBy()
df.groupBy("age").count().show()
- sort()
df.sort(df["age"].desc()).show()
df.sort(df["age"].desc(),df["name"].asc()).show()