Read data from HDFS and define an in-memory table person: "create table person (name string, age int, weight double)" from "hdfs:/test/testperson"
Run a SQL query and write the result straight to a file (either local or HDFS): "select avg(age) from person" hqlsaveto "averageage.txt"
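For reference, the two commands above correspond roughly to the following plain Spark SQL code; this is only a sketch, where the Person case class, the tab-separated input format, and the save step are assumptions for illustration, not the wrapper's actual implementation:

case class Person(name: String, age: Int, weight: Double)
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
import hiveContext.createSchemaRDD

// "create table person (...)" from "hdfs:/test/testperson"
sc.textFile("hdfs:/test/testperson")
  .map(_.split("\t"))
  .map(f => Person(f(0), f(1).trim.toInt, f(2).trim.toDouble))
  .registerAsTable("person")

// "select avg(age) from person" hqlsaveto "averageage.txt"
// (saveAsTextFile writes a directory of part files rather than a single file)
hiveContext.hql("select avg(age) from person")
  .map(_.mkString("\t"))
  .saveAsTextFile("averageage.txt")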
Making data processing with Spark simpler (this version depends on HiveContext and therefore requires recompiling Spark; for a SQLContext version that needs no recompilation, see part six of this series, spark定制之六; the program was updated on July 10):
import org.apache.spark.sql.SchemaRDD

// Default field and record separators used when parsing input files.
var FIELD_SEPERATOR = "\t"
var RECORD_SEPERATOR = "\n"
// Holds the SchemaRDD produced by the most recent query.
var lastrdd : SchemaRDD = null
object MyFileUtil extends java.io.Serializable {
  import org.apache.hadoop.fs.Path
  import org.apache.hadoop.fs.FileSystem
  import org.apache.hadoop.fs.FileStatus
  import scala.collection.mutable.ListBuffer

  // Normalize a path into a fully qualified URI: hdfs: and file: paths are
  // kept as-is, absolute local paths get a file:// prefix, and relative
  // paths are resolved against the current working directory.
  def regularFile(filepath:String):String = {
    if(filepath == "") {
      filepath
    } else if(filepath.startsWith("hdfs:")) {
      filepath
    } else if(filepath.startsWith("file:")) {
      filepath
    } else if(filepath.startsWith("/")) {
      "file://" + filepath
    } else {
      val workdir = System.getProperty("user.dir")
      "file://" + workdir + "/" + filepath
    }
  }
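  // Example results (assuming the working directory is /home/spark):
  //   regularFile("hdfs:/test/testperson") => "hdfs:/test/testperson"
  //   regularFile("/tmp/a.txt")            => "file:///tmp/a.txt"
  //   regularFile("a.txt")                 => "file:///home/spark/a.txt"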
  // Refuse to delete any path shorter than this, as a crude safety guard
  // against wiping out root-level directories.
  var SAFEMINPATH_LENGTH : Int = 24
  // Pick the Hadoop FileSystem implementation that matches the path's scheme.
  def getFileSystem(filepath:String) = {
    if(filepath.startsWith("hdfs:")) {
      FileSystem.get(new org.apache.hadoop.conf.Configuration())
    } else if(filepath.startsWith("file:")) {
      FileSystem.getLocal(new org.apache.hadoop.conf.Configuration())
    } else {
      throw new Exception("file path invalid")
    }
  }
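  // For example, getFileSystem("hdfs:/test/testperson") returns the HDFS
  // filesystem, while getFileSystem("file:///tmp/a.txt") returns the local
  // one; unqualified paths should be passed through regularFile first.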
  def deletePath(filepath:String) = {
    if(filepath.length < SAFEMINPATH_LENGTH)