Adding a Custom Command to Spark SQL
SHOW VERSION
Shows the current Spark version and Java version (with the Scala version thrown in as an extra).
Environment notes:
IDEA version: 2019.3.5
Local Java version: 1.8.201
Local Maven version: 3.6.3
Spark version: 3.1.2
Version mismatches held things up for a long time; the build finally succeeded after switching to a different IDEA version.
I. Adding the command to Spark SQL
1. Add the grammar rules in SqlBase.g4
src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
statement
    ...
    | SHOW VERSION                                                     #showVersion

ansiNonReserved
    ...
    | VERSION

nonReserved
    ...
    | VERSION

//--SPARK-KEYWORD-LIST-START
VERSION: 'VERSION' | 'V';
2. Compile the grammar with ANTLR
Regenerate the parser classes via the local Maven ANTLR plugin (in IDEA this can be run from the Maven tool window).
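The same step from the command line, as a sketch, assuming the standard Spark source layout where the antlr4-maven-plugin runs in the catalyst module's generate-sources phase:
build/mvn -pl sql/catalyst generate-sources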
3. Hook the rule up in SparkSqlParser.scala
Because the new alternative carries the #showVersion label, ANTLR generates a ShowVersionContext and a matching visitor method. Override visitShowVersion() in SparkSqlAstBuilder (defined in SparkSqlParser.scala) and have it return a ShowVersionCommand(). ShowVersionCommand() is a case class, so it can be constructed directly without new; the class itself is created in the next step.
override def visitShowVersion(ctx: ShowVersionContext): LogicalPlan = withOrigin(ctx) {
  ShowVersionCommand()
}
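To sanity-check the new rule before a full rebuild, a small parser test can help. A sketch, assuming the assertEqual helper used by Spark's SparkSqlParserSuite:
test("show version") {
  // Both the full keyword and the short form defined in the grammar should parse.
  assertEqual("SHOW VERSION", ShowVersionCommand())
  assertEqual("SHOW V", ShowVersionCommand())
}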
4. Create the ShowVersionCommand() case class
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.StringType

case class ShowVersionCommand() extends RunnableCommand {

  // Result schema: a single string column named "version".
  override val output: Seq[Attribute] =
    Seq(AttributeReference("version", StringType)())

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val sparkVersion = sparkSession.version
    val javaVersion = System.getProperty("java.version")
    // releaseVersion is an Option, hence the getOrElse below.
    val scalaVersion = scala.util.Properties.releaseVersion
    val versionString = "Spark Version: %s, Java Version: %s, Scala Version: %s"
      .format(sparkVersion, javaVersion, scalaVersion.getOrElse(""))
    Seq(Row(versionString))
  }
}
Finally, rebuild Spark:
build/mvn clean package -DskipTests -Phive -Phive-thriftserver
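After the build, the command can be tried from the bundled SQL shell. An illustrative session (the version values depend on the environment; the line format comes from the format string in run()):
bin/spark-sql
spark-sql> SHOW VERSION;
Spark Version: 3.1.2, Java Version: 1.8.0_201, Scala Version: 2.12.10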
Getting the versions right for this command took far too long, so the remaining two problems have not been looked into yet and will be filled in later. For now, here are improved versions of the earlier inverted index and the Spark implementation of DistCp.
II. Improved inverted index
package cn.hzbstart

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}

object Main {
  def main(args: Array[String]): Unit = {
    // 1. Build a file list from the input directory
    // 2. Union the per-file RDDs into one RDD
    // 3. Count term frequencies, WordCount-style
    // 4. Format the result as required
    val sparkConf = new SparkConf().setMaster("local").setAppName(this.getClass.getName)
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")
    val input = "datas/input"
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val fileList = fs.listFiles(new Path(input), true)
    // (fileName, word) pairs, e.g. (0,it), (0,is)
    var rdd1 = sc.emptyRDD[(String, String)]
    while (fileList.hasNext) {
      val path = fileList.next
      val fileName = path.getPath.getName
      rdd1 = rdd1.union(sc
        .textFile(path.getPath.toString)
        .flatMap(_.split("\\s+"))
        .map((fileName, _)))
    }
    println("---" * 100)
    rdd1.foreach(println)
    println("---" * 100)
    // ((fileName, word), count), e.g. ((0,it),2)
    val rdd2 = rdd1.map((_, 1)).reduceByKey(_ + _)
    println("---" * 100)
    rdd2.foreach(println)
    println("---" * 100)
    // data._1._2 => word, data._1._1 => file name, data._2 => term frequency
    // (it,(1, 1)) + (it,(0, 2)) + (it,(2, 1)) => (it,(1, 1)(0, 2)(2, 1))
    val rdd3 = rdd2.map(data => (data._1._2, String.format("(%s, %s)", data._1._1, data._2.toString)))
      .reduceByKey(_ + _)
    println("---" * 100)
    rdd3.foreach(println)
    println("---" * 100)
    // "it": {(1, 1)(0, 2)(2, 1)}
    val rdd4 = rdd3.map(data => String.format("\"%s\": {%s}", data._1, data._2))
    println("---" * 100)
    rdd4.foreach(println)
    println("---" * 100)
    // rdd4.saveAsTextFile("datas/output")
  }
}
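A note on the loop above: each union adds one lineage branch per input file. A single-pass alternative (a sketch under the same datas/input layout; the names below are illustrative, not part of the original) reads all files at once with wholeTextFiles:
// Each element of wholeTextFiles is (full file path, file content).
val inverted = sc.wholeTextFiles("datas/input")
  .flatMap { case (path, content) =>
    val fileName = path.split("/").last
    content.split("\\s+").map(word => ((fileName, word), 1))
  }
  .reduceByKey(_ + _)                                           // term frequency per (file, word)
  .map { case ((file, word), cnt) => (word, s"($file, $cnt)") } // build one posting
  .reduceByKey(_ + _)                                           // concatenate postings per word
  .map { case (word, postings) => s""""$word": {$postings}""" }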
III. Spark implementation of DistCp
package cn.hzbstart

import org.apache.commons.cli.{DefaultParser, Options}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

object DistCp {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local").setAppName(this.getClass.getName)
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")
    val input = "datas/input"
    val output = "datas/output"
    // Parse args: -i ignores copy failures, -m caps the copy concurrency.
    // Long option names must not contain spaces, hence the hyphenated forms.
    val options = new Options()
    options.addOption("i", "ignore-failure", false, "ignore failure")
    options.addOption("m", "max-concurrency", true, "maximum concurrency")
    val parser = new DefaultParser()
    val cmd = parser.parse(options, args)
    val ignoreFailure = cmd.hasOption("i")
    val maxConcurrency = if (cmd.hasOption("m")) cmd.getOptionValue("m").toInt else 2
    // Recursively list every file under the source directory
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val fileList = fs.listFiles(new Path(input), true)
    val arrayBuffer = ArrayBuffer[String]()
    while (fileList.hasNext) {
      val path = fileList.next().getPath.toString
      arrayBuffer.append(path)
      println(path)
    }
    // One partition per unit of concurrency; each partition copies its files
    val rdd = sc.makeRDD(arrayBuffer, maxConcurrency)
    rdd.foreachPartition(it => {
      // FileSystem handles are not serializable, so build one per partition
      val conf = new Configuration()
      val sfs = FileSystem.get(conf)
      while (it.hasNext) {
        val src = it.next()
        val tgt = src.replace(input, output)
        try {
          FileUtil.copy(sfs, new Path(src), sfs, new Path(tgt), false, conf)
        } catch {
          case e: Exception =>
            if (ignoreFailure) println(s"Ignoring failure while copying $src")
            else throw e
        }
      }
    })
  }
}
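An illustrative way to run it, assuming the job is packaged into a jar (the jar name here is hypothetical):
spark-submit --class cn.hzbstart.DistCp spark-distcp.jar -i -m 4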