package cn.spark.study.sql;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;

/**
 * Example: creating a DataFrame by reading a JSON file from HDFS.
 */
public classDataFrameCreate {public static voidmain (String[] args){
SparkConf conf= newSparkConf()
.setAppName("DataFrameCreate")
.setMaster("local");
JavaSparkContext sc= newJavaSparkContext(conf);
SQLContext sqlContext= newSQLContext(sc);
sqlContext.read().json("hdfs://spark1:9000/test.json").show();
}
}//=======================分隔符======================================
package cn.spark.study.sql;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

/**
 * Example: common DataFrame operations (select, filter, groupBy, count).
 */
public classDataFrameOperation {public static voidmain(String [] args){//创建DataFrame
SparkConf conf = newSparkConf()
.setAppName("DataFrameCreate");
JavaSparkContext sc= newJavaSparkContext(conf);
SQLContext sqlContext= newSQLContext(sc);//创建出来的DataFrame完全可以理解为一张表
Dataset json = sqlContext.read().json("hdfs://spark1:9000/students.json");//打印dataframe ;select * from 表名
json.show();//打印dataframe的元数据信息(schema)
json.printSchema();//查询某一列的数据
json.select("name").show();//查询多列 name ,age 并对所有的age列的结果值加1
json.select(json.col("name") , json.col("age").plus(1)).show();//对某一列的值进行过滤;eg:只展示age字段值大于18的数据
json.select(json.col("age").gt(18)).show();//根据某一列进行分组,并聚合;eg:通过age分组,并求出每组的个数
json.groupBy("age").count().show();
}
}