Requirements
Given the following table structures:
Emp (
Eno CHAR(4),
Ename CHAR(8),
Esex CHAR(1) CHECK(Esex IN ('M','F')),
EDno CHAR(4) REFERENCES Dept (Dno),
PRIMARY KEY (Eno)
);
Dept (
Dno CHAR(4) NOT NULL UNIQUE,
Dname CHAR(20),
Daddr CHAR(30)
);
Task:
With the condition Esex = 'M', count employees per Dname, write the result to the XX directory, stored in Parquet format, delimited by '\t'.
This post mainly records:
1. How to perform a join and run aggregate analysis with Spark and with Spark SQL
2. How to store the result as Parquet files, delimited by '\t' (but see the caveat and sketch right after this list)
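One caveat worth recording before the code: Parquet is a binary, columnar format, so a field delimiter simply does not apply to it, and Spark's Parquet writer silently ignores a "delimiter" option. The minimal sketch below (the sample rows and output paths are made up, not part of the assignment) writes the same DataFrame once as Parquet and once as a genuinely tab-separated text file via the CSV writer's sep option:

import org.apache.spark.sql.SparkSession

object TsvVsParquetSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("tsv vs parquet").master("local[*]").getOrCreate()
    import spark.implicits._
    // Made-up result rows, just to have something to write
    val df = Seq(("Sales", 1), ("IT", 2)).toDF("name", "count")

    // Parquet: binary and columnar, so a "delimiter" option would be ignored
    df.write.mode("overwrite").parquet("/tmp/result_parquet") // hypothetical path

    // An actual tab-separated text file comes from the CSV source with sep="\t"
    df.write.mode("overwrite").option("sep", "\t").csv("/tmp/result_tsv") // hypothetical path

    spark.stop()
  }
}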
Spark part
package scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SparkSession, types}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
// Use Spark to extract and store the data
// (Carried over from the previous quiz: with Spark and Spark SQL, extract Eno, Ename, Esex, Dname, Daddr
// and save the data in Avro format under the "/Quiz/<your name>/" directory)
object quiz2_for_spark_521 {
  def main(args: Array[String]): Unit = {
    // Three arguments are used below (args(0), args(1), args(2)), so require all of them
    if (args.length < 3) {
      println("Usage: quiz2_for_spark_521 <empFile> <deptFile> <outputDir>")
      System.exit(1)
    }
    val conf = new SparkConf().setAppName("RDD to DataFrame").setMaster("local")
    val sparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc = sparkSession.sparkContext
    // Dept lines look like "dno,dname,daddr"; key by Dno so it can be joined with Emp
    val Deptfile = sc.textFile(args(1))
    val DeptRDD = Deptfile.map { l =>
      val line = l.split(',')
      val dno = line(0)
      val dname = line(1)
      val daddr = line(2)
      (dno, dname + ',' + daddr)
    }
    // Emp lines look like "eno,ename,esex,edno"; key by EDno (the department number)
    val Empfile = sc.textFile(args(0))
    val EmpRDD = Empfile.map { l =>
      val line = l.split(',')
      val name = line(1)
      val sex = line(2)
      val edno = line(3)
      (edno, line(0) + ',' + name + ',' + sex)
    }
    // Join the two tables, filter on the sex field, and count with reduceByKey.
    // Note: filtering with contains("M") on the whole joined string would also match
    // an "M" inside a name or address, so test the sex field itself.
    val joinRDD = EmpRDD.join(DeptRDD)
      .map { case (_, (emp, dept)) => emp.split(',') ++ dept.split(',') } // [eno, name, sex, dname, daddr]
      .filter(fields => fields(2) == "M")
      .map(fields => (fields(3), 1)) // key by Dname
      .reduceByKey(_ + _)
      .map { case (dname, count) => Row(dname, count) }
    joinRDD.foreach(println) // quick sanity print; fine in local mode
    val schema = StructType(Seq(
      StructField("name", types.StringType, true),
      StructField("count", IntegerType, true)
    ))
    val df = sparkSession.createDataFrame(joinRDD, schema)
    // Parquet is a binary columnar format: the "delimiter" option is ignored by the
    // Parquet writer (it only applies to text formats such as CSV); it is kept here
    // only because the assignment asks for it.
    df.write.format("parquet").option("delimiter", "\t").save(args(2))
    sc.stop()
  }
}
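Before running against real files, the join/filter/count pipeline can be sanity-checked on in-memory data with parallelize. A minimal sketch follows; the sample rows are made up for illustration and are not from the assignment:

import org.apache.spark.sql.SparkSession

object Quiz2LocalCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("quiz2 local check").master("local[*]").getOrCreate()
    val sc = spark.sparkContext

    // (edno, "eno,name,sex") -- same shape as EmpRDD above
    val emp = sc.parallelize(Seq(
      ("D001", "E001,Alice,F"),
      ("D001", "E002,Bob,M"),
      ("D002", "E003,Carol,F"),
      ("D002", "E004,Dave,M"),
      ("D002", "E005,Eric,M")
    ))
    // (dno, "dname,daddr") -- same shape as DeptRDD above
    val dept = sc.parallelize(Seq(
      ("D001", "Sales,Beijing"),
      ("D002", "IT,Shanghai")
    ))

    val counts = emp.join(dept)
      .map { case (_, (e, d)) => e.split(',') ++ d.split(',') }
      .filter(fields => fields(2) == "M") // sex is the 3rd employee field
      .map(fields => (fields(3), 1))      // key by Dname
      .reduceByKey(_ + _)

    counts.collect().foreach(println) // expected, in some order: (Sales,1), (IT,2)
    spark.stop()
  }
}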
Spark SQL part
package scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SparkSession, types}
import org.apache.spark.sql.types.{StructField, StructType}
// Building on the previous quiz's result: with Esex = 'M', count per Dname, write the result
// to the xx directory, stored as Parquet, delimited by '\t'
object quiz2_for_sql_521 {
  def main(args: Array[String]): Unit = {
    // Three arguments are used below (args(0), args(1), args(2)), so require all of them
    if (args.length < 3) {
      println("Usage: quiz2_for_sql_521 <empFile> <deptFile> <outputDir>")
      System.exit(1)
    }
    val conf = new SparkConf().setAppName("RDD to DataFrame").setMaster("local")
    val sparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc = sparkSession.sparkContext
    val EmpFile = sc.textFile(args(0))
    val EmpRDD = EmpFile.map { l =>
      val l_l = l.split(',')
      val eno = l_l(0)
      val ename = l_l(1)
      val esex = l_l(2)
      val edno = l_l(3)
      // Wrapping each record in a Row is the key step: only an RDD[Row] (plus a
      // schema) can be converted into a DataFrame with createDataFrame.
      Row(eno, ename, esex, edno)
    }
    // The employee department column is named "Dno" (not "EDno") so that the
    // join below can use the shared column name.
    val schema_E = StructType(Seq(
      StructField("Eno", types.StringType, true),
      StructField("Ename", types.StringType, true),
      StructField("Esex", types.StringType, true),
      StructField("Dno", types.StringType, true)
    ))
    val df_E = sparkSession.createDataFrame(EmpRDD, schema_E)
    // df_E.createTempView("E")
    val DeptFile = sc.textFile(args(1))
    val DeptRDD = DeptFile.map { l =>
      val l_l = l.split(',')
      val dno = l_l(0)
      val dname = l_l(1)
      val daddr = l_l(2)
      Row(dno, dname, daddr)
    }
    val schema = StructType(Seq(
      StructField("Dno", types.StringType, true),
      StructField("Dname", types.StringType, true),
      StructField("Daddr", types.StringType, true)
    ))
    val df_D = sparkSession.createDataFrame(DeptRDD, schema)
    // df_D.createTempView("D")
    // Join on the shared Dno column, register the result as a temp view,
    // and from there plain SQL does the work
    val final_df = df_E.join(df_D, "Dno")
    final_df.createTempView("E_D")
    val results = sparkSession.sql(
      "SELECT Dname, count(Esex) AS Number FROM E_D WHERE Esex = 'M' GROUP BY Dname")
    results.show()
    // As above: the "delimiter" option is a no-op for the Parquet writer
    results.write.format("parquet").option("delimiter", "\t").save(args(2))
    sc.stop()
  }
}
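To verify what either job wrote, the output directory can simply be read back; because Parquet stores the schema with the data, the column names and types come back intact without any parsing. A minimal sketch (the path argument is whatever directory the job wrote to):

import org.apache.spark.sql.SparkSession

object ReadBackSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("read back").master("local[*]").getOrCreate()
    val df = spark.read.parquet(args(0)) // args(0): the output directory of the job above
    df.printSchema()                     // schema survives the round trip
    df.show()
    spark.stop()
  }
}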