Hive // SQL over Hadoop MapReduce
Phoenix // SQL interaction layer built on top of HBase
Spark SQL // the Spark module that runs SQL statements on Spark
DataFrame // a data frame, i.e. a table
SparkSQL // SQL | DataFrame API
Access Hadoop in a SQL-like way while the computation runs as MR-style jobs over RDDs.
df = spark.createDataFrame(rdd) // createDataFrame lives on the SparkSession, not the SparkContext
Dataset<Row> === DataFrame ==> // operated on like a table
RDD[Customer] ==>
$scala>val df = spark.createDataFrame(rdd)
//define a case class for the records
$scala>case class Customer(id:Int,name:String,age:Int)
//build the sample data
$scala>val arr = Array("1,tom,12","2,tomas,13","3,tomasLee,14")
$scala>val rdd1 = sc.makeRDD(arr)
//create an RDD of Customer objects (split each line and use the split fields, not the outer array)
$scala>val rdd2 = rdd1.map(line => {val fields = line.split(","); Customer(fields(0).toInt, fields(1), fields(2).toInt)})
//create a DataFrame from the RDD
$scala>val df = spark.createDataFrame(rdd2)
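Equivalently, with the session implicits in scope, an RDD of case-class instances converts directly; a minimal sketch using the rdd2 above:
$scala>import spark.implicits._
$scala>val df = rdd2.toDF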
//print the schema
$scala>df.printSchema
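For the Customer case class above, the printed schema should look roughly like this (primitive Int fields come out non-nullable):
root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)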
$scala>df.show //query the data
//create a temporary view
$scala>df.createTempView("customers")
$scala>val df2 = spark.sql("select * from customers")
$scala>spark.sql("select * from customers").show //使用sql语句
$scala>val df1 = spark.sql("select * from customers where id < 2")
$scala>val df2 = spark.sql("select * from customers where id > 2")
$scala>df1.createTempView("c1")
$scala>df2.createTempView("c2")
$scala>spark.sql("select * from c1 union select * from c2").show()
$scala>df1.union(df2) //the same union via the DataFrame API
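Note that SQL union removes duplicate rows, while the DataFrame union method keeps them (UNION ALL semantics); a minimal sketch to match the SQL result, using the df1/df2 above:
$scala>df1.union(df2).distinct.show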
$scala>spark.sql("select id,name from customers").show
$scala>df.selectExpr("id","name") //projection via the DataFrame API
$scala>df.where("name like 't%'") //filter with a SQL-style predicate
//map each row to its age, then reduce
$scala>df.map(_.getAs[Int]("age")).reduce(_+_) //aggregation over a Dataset[Int]
$scala>df.agg(sum("age"),max("age"),min("age")) //aggregate functions
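sum/max/min are not in scope by default; they come from org.apache.spark.sql.functions. A minimal sketch, assuming the df above:
$scala>import org.apache.spark.sql.functions._
$scala>df.agg(sum("age"),max("age"),min("age")).show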
Working with JSON data from Spark in Java
package com.mao.scala.java;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import java.util.function.Consumer;

/**
 * Created by Administrator on 2017/4/3.
 */
public class SQLJava {
    public static void main(String[] args) {
        //pass the SparkConf settings to the session builder
        SparkConf conf = new SparkConf();
        conf.setMaster("local");
        conf.setAppName("SQLJava");
        SparkSession session = SparkSession.builder()
                .config(conf)
                .getOrCreate();
        Dataset<Row> df1 = session.read().json("file:///d:/mr/sparkJson.dat");
        //create a temporary view
        df1.createOrReplaceTempView("customers");
        df1.show();
        //filter via the DataFrame API
        Dataset<Row> df3 = df1.where("age > 13");
        df3.show();
        //the same query in sql form
        Dataset<Row> df2 = session.sql("select * from customers where age > 13");
        df2.show();
        System.out.println("=================");
        //aggregate query
        Dataset<Row> dfCount = session.sql("select count(1) from customers");
        dfCount.show();
        //DataFrame/RDD interop; JSON schema inference sorts columns
        //alphabetically, hence the order age, id, name
        JavaRDD<Row> rdd = df1.toJavaRDD();
        rdd.collect().forEach(new Consumer<Row>() {
            public void accept(Row row) {
                long age = row.getLong(0);
                long id = row.getLong(1);
                String name = row.getString(2);
                System.out.println(age + "," + id + "," + name);
            }
        });
        //save the result, setting the save mode
        df2.write().mode(SaveMode.Append).json("file:///d:/mr/json/out.dat");
    }
}
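The program expects JSON-lines input (one object per line). A hypothetical sample for d:/mr/sparkJson.dat, with field names taken from the column accesses above and values borrowed from the earlier shell walkthrough:
{"age":12,"id":1,"name":"tom"}
{"age":13,"id":2,"name":"tomas"}
{"age":14,"id":3,"name":"tomasLee"}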
Reading and writing JSON files
//read a json file into a DataFrame: read() is called on a SparkSession instance
session.read().json("..");
//write DataFrame data to a json file: write() is called on the Dataset, not the session
df.write().json("..");
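A minimal round-trip sketch in the shell, reusing the paths from the Java example:
$scala>val df = spark.read.json("file:///d:/mr/sparkJson.dat")
$scala>df.write.mode("append").json("file:///d:/mr/json/out")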