Parquet data source: automatic partition discovery
Java version
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

/**
 * Parquet data source: automatic partition discovery
 */
public class ParquetPartitionDiscovery {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("ParquetPartitionDiscovery")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame parquet = sqlContext.read().parquet("D:\\eclipse\\wc\\scalaworid\\users.parquet");
        parquet.printSchema(); // print the schema
        // Output:
        // root
        //  |-- name: string (nullable = false)
        //  |-- favorite_color: string (nullable = true)
        //  |-- favorite_numbers: array (nullable = false)
        //  |    |-- element: integer (containsNull = false)
        parquet.show(); // show the data
        // Output:
        // +------+--------------+----------------+
        // |  name|favorite_color|favorite_numbers|
        // +------+--------------+----------------+
        // |Alyssa|          null|  [3, 9, 15, 20]|
        // |   Ben|           red|              []|
        // +------+--------------+----------------+
    }
}
Scala version
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

object ParquetLoadData {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ParquetLoadData").setMaster("local")
    val sc = new SparkContext(conf)
    val sQLContext = new SQLContext(sc)
    val dataFrame: DataFrame = sQLContext.read.parquet("D:\\eclipse\\wc\\scalaworid\\users.parquet")
    // Print the names in the form "Name: Alyssa" / "Name: Ben"
    val frame: DataFrame = dataFrame.select("name")
    // Convert to an RDD and print each element
    frame.rdd.map(row => {
      "Name: " + row.getString(0)
    }).foreach(println)
    // The foreach written out in full: .foreach((a: String) => println(a))
    // Output:
    // Name: Alyssa
    // Name: Ben
  }
}
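Both examples above read a single Parquet file, so there are no partition columns to discover. Partition discovery only kicks in when the data is laid out in key=value subdirectories. Below is a minimal Scala sketch of that case; the users_partitioned directory and its layout are hypothetical, not part of the original example.

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object ParquetPartitionDiscoveryDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ParquetPartitionDiscoveryDemo").setMaster("local"))
    val sqlContext = new SQLContext(sc)
    // Hypothetical directory layout (assumption, not from the original example):
    //   users_partitioned/gender=male/country=US/part-00000.parquet
    //   users_partitioned/gender=female/country=CN/part-00000.parquet
    val users = sqlContext.read.parquet("D:\\eclipse\\wc\\scalaworid\\users_partitioned")
    // The directory keys are added as columns, so the schema now contains
    // the file's own columns plus gender and country
    users.printSchema()
  }
}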
Parquet data source: schema merging
Scala version
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Parquet data source: schema merging
 */
object ParquetMergeSchema {
  def main(args: Array[String]): Unit = {
    // When writing to HDFS you may hit a user-permission mismatch; in that case add
    // System.setProperty("HADOOP_USER_NAME", "root") at the top of main
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("ParquetMergeSchema")
    val sc = new SparkContext(conf)
    val sQLContext = new SQLContext(sc)
    // Create a DataFrame with the students' basic info and write it out as a Parquet file
    import sQLContext.implicits._
    val sNameAndAge = Array(("xiaoming", 20), ("liubang", 26))
    val sNameAndAgeDF: DataFrame = sc.parallelize(sNameAndAge).toDF("name", "age")
    sNameAndAgeDF.save("D:\\eclipse\\wc\\scalaworid\\Parqueta", "parquet", SaveMode.Append)
    val snameAndGrade = Array(("daji", "B"), ("zhaoyun", "A"))
    val snameAndGradeDF: DataFrame = sc.parallelize(snameAndGrade).toDF("name", "grade")
    snameAndGradeDF.save("D:\\eclipse\\wc\\scalaworid\\Parqueta", "parquet", SaveMode.Append)
    // The two data sets have different schemas.
    // With mergeSchema enabled, the schemas of both files are merged automatically,
    // so the query sees the columns name, age, grade.
    val students: DataFrame = sQLContext.read.option("mergeSchema", "true").parquet("D:\\eclipse\\wc\\scalaworid\\Parqueta")
    students.printSchema() // schema
    // Output:
    // root
    //  |-- name: string (nullable = true)
    //  |-- age: integer (nullable = true)
    //  |-- grade: string (nullable = true)
    students.show() // data
    // Output:
    // +--------+----+-----+
    // |    name| age|grade|
    // +--------+----+-----+
    // |    daji|null|    B|
    // | zhaoyun|null|    A|
    // |xiaoming|  20| null|
    // | liubang|  26| null|
    // +--------+----+-----+
  }
}
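The save(path, source, mode) call used above is the older Spark 1.3-style API. On newer 1.x releases the same write is usually expressed through the DataFrameWriter (the JSON examples further down already use it). A minimal sketch under that assumption, reusing the directory and data from the example above:

import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}

object ParquetMergeSchemaWriter {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ParquetMergeSchemaWriter").setMaster("local"))
    val sQLContext = new SQLContext(sc)
    import sQLContext.implicits._
    val path = "D:\\eclipse\\wc\\scalaworid\\Parqueta"
    // The same two data sets as above, written through the DataFrameWriter API (Spark 1.4+)
    sc.parallelize(Array(("xiaoming", 20), ("liubang", 26))).toDF("name", "age")
      .write.mode(SaveMode.Append).parquet(path)
    sc.parallelize(Array(("daji", "B"), ("zhaoyun", "A"))).toDF("name", "grade")
      .write.mode(SaveMode.Append).parquet(path)
    // On Spark 1.5+ schema merging can also be switched on globally instead of per read:
    // sQLContext.setConf("spark.sql.parquet.mergeSchema", "true")
    sQLContext.read.option("mergeSchema", "true").parquet(path).show()
  }
}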
JSON data source
Note: there are two ways to query a DataFrame:
1. Register it as a temporary table, then query it with the sql() method (passing a SQL statement)
2. Call methods such as .select() on the DataFrame directly
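A minimal Scala sketch of both styles, assuming the same people.json (with name and score fields) that the examples below read:

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object DataFrameQueryStyles {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("DataFrameQueryStyles").setMaster("local"))
    val sqlContext = new SQLContext(sc)
    // Assumed input: a people.json file with name and score fields
    val df = sqlContext.read.json("D:\\eclipse\\wc\\scalaworid\\people.json")
    // Style 1: register a temporary table and query it with SQL
    df.registerTempTable("student_score")
    sqlContext.sql("select name, score from student_score where score >= 80").show()
    // Style 2: call the DataFrame API directly
    df.select("name", "score").filter(df.col("score") >= 80).show()
  }
}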
Java version
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
/**
 * JSON data source
 */
public class JsonDataSource {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("JsonDataSource");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        // Part 1: create a DataFrame from a JSON file
        DataFrame studentScoreDF = sqlContext.read().json("D:\\eclipse\\wc\\scalaworid\\people.json");
        // Find the students whose score is at least 80.
        // Note: here the DataFrame is queried by registering a temporary table and calling sql()
        studentScoreDF.registerTempTable("student_score");
        DataFrame goodsStudentScoreDF = sqlContext.sql("select name,score from student_score where score >= 80");
        // Collect the names into a List; it is needed below
        List<String> goodsStudentNames = goodsStudentScoreDF.javaRDD().map(new Function<Row, String>() {
            @Override
            public String call(Row row) throws Exception {
                return row.getString(0);
            }
        }).collect();
        // Part 2: create a DataFrame from a JavaRDD<String>
        // whose elements are JSON strings
        List<String> studentInfosJson = new ArrayList<String>();
        // Each element is a JSON string such as {"name":"Leo", "score":85}
        studentInfosJson.add("{\"name\":\"Michael\", \"age\":18}");
        studentInfosJson.add("{\"name\":\"Andy\", \"age\":17}");
        studentInfosJson.add("{\"name\":\"Jack\", \"age\":19}");
        JavaRDD<String> StudentInfoJsonRDD = sc.parallelize(studentInfosJson);
        DataFrame studentInfoDF = sqlContext.read().json(StudentInfoJsonRDD);
        // Register the student-info DataFrame as a temporary table,
        // then look up the basic info of the students who scored at least 80
        studentInfoDF.registerTempTable("student_info");
        // Build a SQL statement of the form:
        // select name,age from student_info where name in ('leo','marry')
        String sql = "select name,age from student_info where name in (";
        for (int i = 0; i < goodsStudentNames.size(); i++) {
            sql += "'" + goodsStudentNames.get(i) + "'";
            if (i < goodsStudentNames.size() - 1) {
                sql += ",";
            }
        }
        sql += ")";
        DataFrame goodStudentInfoDF = sqlContext.sql(sql);
        // Convert both DataFrames to JavaPairRDDs and run a join transformation:
        // DataFrame -> JavaRDD -> mapToPair -> join
        // (name, score)
        JavaPairRDD<String, Integer> goodsStudentScoreTuple = goodsStudentScoreDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                return new Tuple2<String, Integer>(row.getString(0),
                        Integer.valueOf(String.valueOf(row.getLong(1))));
            }
        });
        // (name, age)
        JavaPairRDD<String, Integer> goodStudentInfoTuple = goodStudentInfoDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Row row) throws Exception {
                return new Tuple2<String, Integer>(row.getString(0),
                        Integer.valueOf(String.valueOf(row.getLong(1))));
            }
        });
        JavaPairRDD<String, Tuple2<Integer, Integer>> join = goodsStudentScoreTuple.join(goodStudentInfoTuple);
        // Convert the joined records (the full info of the good students) into a JavaRDD<Row>
        JavaRDD<Row> goodStudentRowRDD = join.map(new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
            @Override
            public Row call(Tuple2<String, Tuple2<Integer, Integer>> tuple2) throws Exception {
                return RowFactory.create(tuple2._1, tuple2._2._1, tuple2._2._2);
            }
        });
        // Create a schema so the JavaRDD<Row> can be converted into a DataFrame
        List<StructField> studentFields = new ArrayList<StructField>();
        studentFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        studentFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
        studentFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(studentFields);
        // Convert the JavaRDD<Row> into a DataFrame and save it as JSON
        DataFrame dataFrame = sqlContext.createDataFrame(goodStudentRowRDD, structType);
        dataFrame.write().format("json").save("D:\\eclipse\\wc\\scalaworid\\dataframe");
        // Output:
        // {"name":"Michael","score":89,"age":18}
        // {"name":"Andy","score":83,"age":17}
    }
}
Scala version
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Two ways to create a DataFrame from JSON:
 * 1. read a JSON file directly
 * 2. read an RDD[String] whose elements are JSON strings
 */
object JsonDataScore2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("JsonDataScore2")
      .setMaster("local")
    val sc = new SparkContext(conf)
    val sQLContext = new SQLContext(sc)
    // Create the student-score DataFrame from a JSON file
    val studentScoreDF: DataFrame = sQLContext.read.json("D:\\eclipse\\wc\\scalaworid\\people.json")
    // Find the students whose score is at least 80
    val goodStudentScoreDF: DataFrame = studentScoreDF.filter(studentScoreDF.col("score") >= 80)
    val goodStudentNames: Array[String] = goodStudentScoreDF.map(row => row.getString(0)).collect()
    // Create the student basic-info data from an RDD of JSON strings
    val studentInfosJSONs = Array(
      "{\"name\":\"Michael\", \"age\":18}",
      "{\"name\":\"Andy\", \"age\":17}",
      "{\"name\":\"Jack\", \"age\":19}")
    val studentInfoJsonRDD: RDD[String] = sc.parallelize(studentInfosJSONs, 1)
    val studentInfoDF: DataFrame = sQLContext.read.json(studentInfoJsonRDD)
    // Query the basic info of the students whose score is at least 80
    studentInfoDF.registerTempTable("student_info")
    var sql = "select name,age from student_info where name in ("
    for (i <- 0 until goodStudentNames.length) {
      sql += "'" + goodStudentNames(i) + "'"
      if (i < goodStudentNames.length - 1) {
        sql += ","
      }
    }
    sql += ")"
    val goodStudentInfoDF = sQLContext.sql(sql)
    // goodStudentInfoDF.foreach(row => {
    //   println(row.getString(0) + "," + row.getLong(1))
    // })

    // Join the two result sets on name: (name, (score, age))
    val goodStudentRDD: RDD[(String, (Long, Long))] = goodStudentScoreDF.rdd.map(row => (row.getAs[String]("name"), row.getAs[Long]("score")))
      .join(goodStudentInfoDF.rdd.map(row => (row.getAs[String]("name"), row.getAs[Long]("age"))))
    val goodStudentRowRDD: RDD[Row] = goodStudentRDD.map(info => {
      // Note: the values must match the types declared in the StructType below (score and age as Int)
      Row(info._1, info._2._1.toInt, info._2._2.toInt)
    })
    val structType = StructType(Array(
      StructField("name", StringType, true),
      StructField("score", IntegerType, true),
      StructField("age", IntegerType, true)))
    val goodStudentsDF: DataFrame = sQLContext.createDataFrame(goodStudentRowRDD, structType)
    goodStudentsDF.write.format("json").save("D:\\eclipse\\wc\\scalaworid\\good_student_score_scala")
    // Output:
    // {"name":"Michael","score":89,"age":18}
    // {"name":"Andy","score":83,"age":17}
  }
}
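The string-built IN clause plus the hand-rolled pair-RDD join above can also be replaced by a plain DataFrame join on name. A minimal sketch of that alternative, assuming Spark 1.4+ (where join(other, usingColumn) is available) and the same input data:

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object JsonDataScoreJoin {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("JsonDataScoreJoin").setMaster("local"))
    val sqlContext = new SQLContext(sc)
    val scoreDF = sqlContext.read.json("D:\\eclipse\\wc\\scalaworid\\people.json")
    val infoDF = sqlContext.read.json(sc.parallelize(Seq(
      "{\"name\":\"Michael\", \"age\":18}",
      "{\"name\":\"Andy\", \"age\":17}",
      "{\"name\":\"Jack\", \"age\":19}")))
    // Filter and join directly on the DataFrames instead of building an IN clause
    // and joining pair RDDs by hand
    val goodStudents = scoreDF.filter(scoreDF.col("score") >= 80)
      .join(infoDF, "name")
      .select("name", "score", "age")
    goodStudents.show()
  }
}

Keeping the filter and join at the DataFrame level also lets Spark SQL's optimizer plan the query instead of dropping down to raw RDD operations.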