package com.lyzx.day42
import java.sql.DriverManager

import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.{Row, SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}
class D42 {
    /**
     * Define a schema explicitly to set each column's name and type.
     * @param ctx
     */
    def f2(ctx:SQLContext): Unit ={
        import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
        val sc = ctx.sparkContext
        val data = sc.textFile("./Person.txt")
                     .map(_.split(","))
                     .map(x=>Row(x(0).toInt,x(1),x(2).toInt))

        val schema = StructType(Array(
            StructField("_id",IntegerType,false),
            StructField("_name",StringType,false),
            StructField("_age",IntegerType,false)))

        val df = ctx.createDataFrame(data,schema)
        df.registerTempTable("person")
        val result = ctx.sql("select _id,_name,_age as age from person where _age >= 46")
        // result.foreach(x=>{println(x.getAs[Int]("_id")+1000,"-",x.getAs("age"),"-",x.getAs("_name"))})

        // Save the result as a parquet file. The available SaveModes:
        // SaveMode.Append        - append to the existing data
        // SaveMode.ErrorIfExists - the default: fail if the path already exists
        // SaveMode.Ignore        - skip the write entirely if the path already exists
        // SaveMode.Overwrite     - replace the existing data
        result.write.mode(SaveMode.Ignore).parquet("./p1.parquet")
        result.write.json("./result.json")
    }
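    /**
     * A minimal sketch of how the four SaveModes behave against a path that
     * already exists. The method name f2b and the path "./p2.parquet" are
     * illustrative additions (not part of the original lesson code), and it
     * assumes f2 has already produced ./p1.parquet.
     */
    def f2b(ctx:SQLContext): Unit ={
        val df = ctx.read.parquet("./p1.parquet")
        df.write.mode(SaveMode.Overwrite).parquet("./p2.parquet") // replaces whatever is there
        df.write.mode(SaveMode.Ignore).parquet("./p2.parquet")    // path now exists, so this write is a no-op
        df.write.mode(SaveMode.Append).parquet("./p2.parquet")    // writes the same rows a second time
        // df.write.mode(SaveMode.ErrorIfExists).parquet("./p2.parquet") // default mode: would throw here
    }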
    // Read a parquet- or json-format file back into the program.
    def f3(ctx:SQLContext): Unit ={
        // val df = ctx.read.format("parquet").load("./p1.parquet")
        // val df = ctx.read.format("json").load("./result.json")
        val df = ctx.read.format("json").load("hdfs://master:9000/sparkSql")
        // note: foreach runs on the executors, so in cluster mode this
        // output lands in the executor logs, not on the driver console
        df.foreach(println)
    }
    /**
     * Automatic partition discovery for parquet.
     * When a parquet file is placed under a directory named key=value,
     * queries on it automatically gain a column named key whose value is value
     * for every row. For example, a parquet file stored under /user/country=china/cx
     * gains a country column whose value is "china" when queried; see the
     * read-back sketch after this method.
     * @param ctx
     */
    def f6(ctx:SQLContext): Unit ={
        import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
        val sc = ctx.sparkContext
        val data = sc.textFile("./Person.txt")
                     .map(_.split(","))
                     .map(x=>Row(x(0).toInt,x(1),x(2).toInt))

        val schema = StructType(Array(
            StructField("_id",IntegerType,false),
            StructField("_name",StringType,false),
            StructField("_age",IntegerType,false)))

        val person = ctx.createDataFrame(data,schema)
        person.write.mode(SaveMode.Overwrite).parquet("hdfs://master:9000/sparkSql/gender=male/country=china")
    }
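    /**
     * A read-back sketch for the partition discovery described above; it
     * assumes f6 has already written parquet under hdfs://master:9000/sparkSql.
     * The method name f6b and the temp table name are illustrative additions.
     */
    def f6b(ctx:SQLContext): Unit ={
        // Loading the parent directory makes Spark scan gender=male/country=china
        // and surface gender and country as ordinary queryable columns.
        val df = ctx.read.parquet("hdfs://master:9000/sparkSql")
        df.printSchema() // _id, _name, _age plus the discovered gender and country columns
        df.registerTempTable("partitionedPerson")
        ctx.sql("select _name, gender, country from partitionedPerson").show()
    }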
    // Read a MySQL table through JDBC.
    def f8(ctx:SQLContext):Unit ={
        val info = Map(
            "url"      -> "jdbc:mysql://localhost:3306/mytest",
            "dbtable"  -> "score",
            "user"     -> "root",
            "password" -> "123456",
            "driver"   -> "com.mysql.jdbc.Driver") // the key must be "driver"; a misspelling like "diver" is silently ignored
        val df = ctx.read.format("jdbc").options(info).load()
        df.registerTempTable("score")

        val result = ctx.sql("select * from score")
        result.show(3)
    }
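    /**
     * A sketch of a parallel JDBC read. The partitionColumn/lowerBound/
     * upperBound/numPartitions options make Spark issue several range queries
     * on a numeric column instead of one single-threaded scan. The id range
     * 1..1000 is an assumed, illustrative value, as is the method name f8b.
     */
    def f8b(ctx:SQLContext): Unit ={
        val df = ctx.read.format("jdbc").options(Map(
            "url"             -> "jdbc:mysql://localhost:3306/mytest",
            "dbtable"         -> "score",
            "user"            -> "root",
            "password"        -> "123456",
            "driver"          -> "com.mysql.jdbc.Driver",
            "partitionColumn" -> "id",
            "lowerBound"      -> "1",
            "upperBound"      -> "1000",
            "numPartitions"   -> "4")).load()
        println("partitions = " + df.rdd.partitions.length) // expect 4
    }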
    // Write data into the database through JDBC.
    def f9(ctx:SQLContext): Unit ={
        val textScore = ctx.read.format("json").load("./score.json")
        textScore.registerTempTable("textScore")
        val result = ctx.sql("select id,score from textScore")
        // one connection per partition, not per row
        result.foreachPartition(itr=> {
            Class.forName("com.mysql.jdbc.Driver")
            val conn = DriverManager.getConnection("jdbc:mysql://localhost/mytest","root","123456")
            val ps = conn.prepareStatement("insert into score values(?,?)")
            for(item <- itr){
                val id = item.getAs[Long]("id")
                val score = item.getAs[Long]("score")
                println(id+" ::: "+score)
                ps.setLong(1,id)
                ps.setLong(2,score)
                ps.executeUpdate()
            }
            // close resources so connections are not leaked across partitions
            ps.close()
            conn.close()
        })
    }
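    /**
     * A simpler alternative sketch: let Spark's own JDBC writer handle the
     * insert. Assumes the target table's columns match the DataFrame's; the
     * table name "score_copy" and the method name f9b are illustrative
     * additions.
     */
    def f9b(ctx:SQLContext): Unit ={
        val props = new java.util.Properties()
        props.setProperty("user", "root")
        props.setProperty("password", "123456")
        props.setProperty("driver", "com.mysql.jdbc.Driver")
        val df = ctx.read.format("json").load("./score.json")
        // write.jdbc opens and closes connections per partition internally,
        // so no manual DriverManager/PreparedStatement bookkeeping is needed
        df.write.mode(SaveMode.Append).jdbc("jdbc:mysql://localhost:3306/mytest", "score_copy", props)
    }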
    /**
     * UDF: user-defined function.
     * @param ctx
     */
    def f10(ctx:SQLContext): Unit ={
        // Register the function and supply its body in one step.
        ctx.udf.register("myLen",(param:String)=>{param.length})

        val score = ctx.read.format("json").load("./score.json")
        score.registerTempTable("score")
        val result = ctx.sql("select id,score,myLen(score) as len from score")
        result.show()
    }
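    /**
     * The same registered UDF can also be invoked from the DataFrame API via
     * callUDF (available in Spark 1.5+), without writing SQL. A minimal
     * sketch; the method name f10b is an illustrative addition.
     */
    def f10b(ctx:SQLContext): Unit ={
        import org.apache.spark.sql.functions.{callUDF, col}
        ctx.udf.register("myLen",(param:String)=>{param.length})
        val score = ctx.read.format("json").load("./score.json")
        // callUDF looks up the function registered above by name
        score.select(col("id"), col("score"), callUDF("myLen", col("score")).as("len")).show()
    }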
    /**
     * UserDefinedAggregateFunction
     * UDAF: user-defined aggregate function.
     */
    def f11(ctx:SQLContext): Unit ={
        val score = ctx.read.format("json").load("./score.json")
        score.registerTempTable("score")

        import org.apache.spark.sql.types._
        /**
         * Use a UDAF to implement something like SQL's COUNT.
         */
        ctx.udf.register("myCount",new UserDefinedAggregateFunction {

            // The input is a single String column.
            override def inputSchema: StructType = {
                StructType(Array(StructField("str",StringType,true)))
            }

            // The type of the final return value.
            override def dataType: DataType = IntegerType

            // Deterministic: the same input always produces the same output.
            override def deterministic: Boolean = true

            /**
             * update acts like a map-side combiner:
             * buffer holds the aggregate accumulated so far,
             * input holds the row currently being folded in.
             */
            override def update(buffer: MutableAggregationBuffer,input:Row): Unit = {
                buffer(0) = buffer.getAs[Int](0)+1
                val item = input.size
                println("item="+item)
            }

            // The schema of the intermediate aggregation buffer.
            override def bufferSchema: StructType =
                StructType(Array(StructField("str",IntegerType,true)))

            /**
             * merge acts like the reduce-side aggregation,
             * combining two partial buffers.
             * @param buffer1
             * @param buffer2
             */
            override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
                buffer1(0) = buffer1.getAs[Int](0)+buffer2.getAs[Int](0)
            }

            // Set the buffer's initial value.
            override def initialize(buffer: MutableAggregationBuffer): Unit = {
                println("buffer.size="+buffer.size)
                buffer(0) = 0
            }

            override def evaluate(buffer: Row): Any = buffer.getAs[Int](0)
        })

        val result = ctx.sql("select id,count(score),myCount(id) as myCount from score group by id")
        result.show()
    }
}
object D42{
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("day42").setMaster("local")
        val sc = new SparkContext(conf)
        val ctx = new SQLContext(sc)
        val d = new D42

        // d.f1(ctx)
        // d.f2(ctx)
        // d.f3(ctx)
        // d.f4(ctx)
        // d.f5(ctx)
        // d.f6(ctx)
        // d.f7(ctx)
        // d.f8(ctx)
        // d.f9(ctx)
        // d.f10(ctx)
        d.f11(ctx)
    }
}