1、如何对DataFrame中封装的数据进行操作
方法一:
sqlContext.read()
返回DataFrameReader对象
sqlContext.read().json("student.json")
读取一个json文件(这个json文件中的内容不能是嵌套的)读进来变成DataFrame,
df.select("age").show(),如果没有show,这个程序就不会执行,这个show就类似于Spark中Action类型的算子,触发执行
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
/**
 * Example 1: operating on a DataFrame through the typed DSL
 * (select / filter / show), without writing SQL text.
 */
public class TestSparkSQL {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("DataFrameOps").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(sc);
            // Read a flat (non-nested) JSON file into a DataFrame.
            DataFrame df = sqlContext.read().json("people.json");
            // Like "SELECT * FROM table" — show() is an action and triggers execution.
            df.show();
            // Like "DESC table": print the inferred schema.
            df.printSchema();
            // SELECT age FROM table
            df.select("age").show();
            // SELECT name FROM table
            df.select("name").show();
            // SELECT name, age + 10 FROM table
            df.select(df.col("name"), df.col("age").plus(10)).show();
            // SELECT * FROM table WHERE age > 20
            df.filter(df.col("age").gt(20)).show();
        } finally {
            // Fix: the original never stopped the context, leaking the
            // local Spark driver and its resources.
            sc.stop();
        }
    }
}
通过df.show() df.printSchema()便可以查看该df中的信息。
方法二:通过注册临时表,传入SQL语句
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
/**
 * Example 2: registering the DataFrame as a temporary table and
 * querying it with a SQL string via sqlContext.sql(...).
 */
public class TestSparkSQL {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("DataFrameOps").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(sc);
            DataFrame df = sqlContext.read().json("people.json");
            // Register the DataFrame as a temporary table so it can be
            // queried with plain SQL.
            df.registerTempTable("people");
            DataFrame sql = sqlContext.sql("SELECT * FROM people WHERE age IS NOT NULL");
            // show() is an action — without it nothing is executed.
            sql.show();
        } finally {
            // Fix: the original never stopped the context, leaking the
            // local Spark driver and its resources.
            sc.stop();
        }
    }
}