Spark Customization, Part 4: The Complete start.scala

This post describes a custom Spark setup that reads data from HDFS into an in-memory table `person`, runs a SQL query that computes the average age, and saves the result to a file. It uses HiveContext to simplify data processing in Spark; the version shown here requires a recompile.

Read data from HDFS and define the in-memory table person: "create table person (name string,age int,weight double)" from "hdfs:/test/testperson"

Run a SQL query and write the result directly to a file (local or on HDFS): "select avg(age) from person" hqlsaveto "averageage.txt"

The goal is to make data processing with Spark simpler. This version relies on HiveContext and therefore requires a recompile; for the SQLContext version that needs no recompiling, see Spark Customization, Part 6 (program updated on July 10):
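
For reference, the table definition above expects the input to be plain text with one record per line and fields separated by FIELD_SEPERATOR (a tab by default). A hypothetical hdfs:/test/testperson could look like this (fields tab-separated):

    zhangsan    25    62.5
    lisi        30    70.0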


import org.apache.spark.sql.SchemaRDD

// field and record separators used when printing and saving query results
var FIELD_SEPERATOR = "\t"
var RECORD_SEPERATOR = "\n"
// holds the SchemaRDD produced by the most recent query
var lastrdd : SchemaRDD = null

object MyFileUtil extends java.io.Serializable {
    import org.apache.hadoop.fs.Path
    import org.apache.hadoop.fs.FileSystem
    import org.apache.hadoop.fs.FileStatus
    import scala.collection.mutable.ListBuffer

    def regularFile(filepath:String):String = {
        if(filepath == "") {
            filepath;
        } else if(filepath.startsWith("hdfs:")) {
            filepath
        } else if(filepath.startsWith("file:")) {
            filepath
        } else if(filepath.startsWith("/")) {
            "file://" + filepath
        } else {
            val workdir = System.getProperty("user.dir")
            "file://" + workdir + "/" + filepath
        }
    }

    var SAFEMINPATH_LENGTH : Int = 24

    def getFileSystem(filepath:String) = {
        if(filepath.startsWith("hdfs:")) {
            FileSystem.get(new org.apache.hadoop.conf.Configuration());
        } else if(filepath.startsWith("file:")) {
            FileSystem.getLocal(new org.apache.hadoop.conf.Configuration());
        } else {
            throw new Exception("file path invalid")
        }
    }

    def deletePath(filepath:String) = {
        // refuse to delete suspiciously short paths
        if(filepath.length < SAFEMINPATH_LENGTH)
            throw new Exception("file path is too short")
        var fs : FileSystem = getFileSystem(filepath)
        if (fs.exists(new Path(filepath))) {
            fs.delete(new Path(filepath), true);
        }
    }

    def listFile(fs:FileSystem, path:Path, pathlist:ListBuffer[Path], statuslist:ListBuffer[FileStatus]=null) {
        if ( fs.exists(path) ) {
            val substatuslist =  fs.listStatus(path);
            for(substatus <- substatuslist){
                if(statuslist != null)
                    statuslist.append(substatus)
                if(substatus.isDir()){
                    listFile(fs,substatus.getPath(),pathlist,statuslist);
                }else{
                    pathlist.append(substatus.getPath());
                }
            }
        }
    }

    def hasContext(filepath:String) = {
        val realpath = regularFile(filepath)
        val fs = getFileSystem(realpath) 
        val pathlist = ListBuffer[Path]()
        val statuslist = ListBuffer[FileStatus]()
        listFile(fs,new Path(realpath),pathlist,statuslist)
        var length:Long = 0
        for( status <- statuslist )
            length += status.getLen()
        length > 0
    }
}
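
// Usage sketch for MyFileUtil (the paths below are hypothetical examples):
//   MyFileUtil.regularFile("averageage.txt")        // relative path -> "file://" + current working dir + "/averageage.txt"
//   MyFileUtil.regularFile("hdfs:/test/testperson") // "hdfs:" and "file:" paths pass through unchanged
//   MyFileUtil.hasContext("hdfs:/test/testperson")  // true when the files under the path contain at least one byte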

org.apache.spark.repl.Main.interp.command("""
class MySchemaRDD(rdd:org.apache.spark.sql.SchemaRDD) extends java.io.Serializable {

    def go() = {
        var startstr = ""
        var endstr = RECORD_SEPERATOR
        val result = rdd.collect
        result.foreach( x =>
            print(x.mkString(startstr,FIELD_SEPERATOR,endstr))
          )
    }

    def result() = {
        rdd.collect
    }

    def saveto(output: String) = {
        import org.apache.hadoop.io.{NullWritable,Text}
        var startstr = ""
        var endstr = RECORD_SEPERATOR
        if(output.startsWith("hdfs:")) {
            val outputpath = MyFileUtil.regularFile(output)
            MyFileUtil.deletePath(outputpath)
            rdd.map(x => 
                  (NullWritable.get(), new Text(x.mkString(FIELD_SEPERATOR)))
                ).saveAsHadoopFile[
                  org.apache.hadoop.mapred.TextOutputFormat[NullWritable, Text]
                ](outputpath)
        } else {
            val outputpath = MyFileUtil.regularFile(output)
            MyFileUtil.deletePath(outputpath)
            val result = rdd.collect()
            val writer = new java.io.FileWriter(output)
            result.foreach(x => 
                writer.write(x.mkString(startstr,FIELD_SEPERATOR,endstr))
              )
            writer.close()
        }
    }
}
object MySchemaRDD {
    implicit def toMySchemaRDD(rdd:org.apache.spark.sql.SchemaRDD) = new MySchemaRDD(rdd)
}
""")

val hive = new org.apache.spark.sql.hive.HiveContext(sc)
import hive._
import MySchemaRDD._
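
// With the implicit conversion imported, any SchemaRDD gains go/result/saveto.
// For example, assuming a table named testperson has already been registered:
//   hql("select name,age from testperson").go()                     // print rows to the console
//   hql("select avg(age) from testperson").saveto("averageage.txt") // write the result to a local file
//   hql("select * from testperson").saveto("hdfs:/test/output")     // or to an HDFS directory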

def getRegisterString(rddname:String,classname:String,tablename:String,tabledef:String) : String = {
    val members = tabledef.trim.split(",").map(_.trim.split(" ").filter(""!=)).map(x => (x(0).trim,x(1).trim.head.toString.toUpperCase+x(1).trim.tail))
    val classmemberdef = members.map(x => (x._1+":"+x._2)).mkString(",")
    val convertstr = members.map(x => x._2).zipWithIndex.map(x => "t("+x._2+").to"+x._1).mkString(",")
    return s"""
        case class ${classname}(${classmemberdef})
        val schemardd = ${rddname}.map(_.split("${FIELD_SEPERATOR}")).map(t=>${classname}(${convertstr}))
        hive.registerRDDAsTable(schemardd,"${tablename}")
    """
}
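
// For example, getRegisterString("testperson","TESTPERSON","testperson","name string,age int,weight double")
// returns code equivalent to the following (FIELD_SEPERATOR is embedded as a literal tab character):
//   case class TESTPERSON(name:String,age:Int,weight:Double)
//   val schemardd = testperson.map(_.split("\t")).map(t=>TESTPERSON(t(0).toString,t(1).toInt,t(2).toDouble))
//   hive.registerRDDAsTable(schemardd,"testperson")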

org.apache.spark.repl.Main.interp.command("""
class MyCommandTranslator(cmd:String) extends java.io.Serializable {

    def sqlgo()(implicit f: SchemaRDD => MySchemaRDD) = {
        lastrdd = sql(cmd)
        lastrdd.go()
    }

    def sqlsaveto(output: String)(implicit f: SchemaRDD => MySchemaRDD) = {
        lastrdd = sql(cmd)
        lastrdd.saveto(output)
    }

    def hqlgo()(implicit f: SchemaRDD => MySchemaRDD) = {
        lastrdd = hql(cmd)
        lastrdd.go()
    }

    def hqlsaveto(output: String)(implicit f: SchemaRDD => MySchemaRDD) = {
        lastrdd = hql(cmd)
        lastrdd.saveto(output)
    }

    def hqlresult()(implicit f: SchemaRDD => MySchemaRDD) = {
        lastrdd = hql(cmd)
        lastrdd.result()
    }

    def defineas(tabledef:String) = {
        if( tabledef != "" ) {
            org.apache.spark.repl.Main.interp.command( 
                getRegisterString(cmd,cmd.toUpperCase,cmd,tabledef)
            )
        } else {
            org.apache.spark.repl.Main.interp.command(
                "hive.registerRDDAsTable(${cmd},\"${cmd}\")"
            )
        }
    }

    def from(filepath:String) {
        if( cmd.trim.startsWith("create table ") ) {
            val tablename = cmd.trim.substring(13).trim().split(" ")(0)
            val leftstr = cmd.substring(13).trim().substring(tablename.length).trim()
            val tabledef = leftstr.substring(1,leftstr.length-1).trim()
            val realfile = MyFileUtil.regularFile(filepath)
            org.apache.spark.repl.Main.interp.command(
                "val "+tablename+" = sc.textFile(\""+realfile+"\")"
            )
            new MyCommandTranslator(tablename).defineas(tabledef)
        } else {
            println("usage:")
            println("\"create table sometablename (field1 string,field2 int...)\" from \"somefile or hdfs:somepath\"")
        }
    }

    def isok() = {
        if(cmd.contains(".") || cmd.contains("/")) {
            MyFileUtil.hasContext(cmd)
        } else {
            val res = hql(s"select count(*) from ${cmd}").result()
            val count = res(0).getLong(0)
            count > 0
        }
    }
}
object MyCommandTranslator {
    implicit def stringToTranslator(cmd:String) = new MyCommandTranslator(cmd)

    def show(tabledata:Array[org.apache.spark.sql.Row]) = {
        tabledata.foreach( x => println(x.mkString("\t")))
    }
}
""")

def to = MyCommandTranslator
import MyCommandTranslator._

val onetable = sql("select 1 as id")
hive.registerRDDAsTable(onetable,"onetable")

def help = {
    println("""example:
        "select * from testperson" hqlgo
        "select * from testperson" hqlsaveto "somelocalfile.txt"
        "select * from testperson" hqlsaveto "hdfs:/basedir/parentdir/subdir"
        "create table sometable (name string,age int,weight double)" from "hdfs:/test/testperson"
        "sometable" isok
        "somelocalfile.txt" isok
        "hdfs:/basedir/parentdir/subdir" isok
        val data = "select * from testperson" hqlresult
        to show data
        val somerdddata = sc.textFile("hdfs:/test/testperson")
        "somerdddata" defineas "name string,age int,weight double"
        "select * from somerdddata" hqlgo
        if you want to see the help of the environment, please type :help
        """)
}


