file:///home/text1.txt reads from the local filesystem
hdfs://clusterA/direct1/text1.txt reads from the cluster
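Both schemes can be passed straight to textFile; the path prefix decides which FileSystem implementation serves the read. A minimal sketch using the two example paths above:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
val localLines = spark.sparkContext.textFile("file:///home/text1.txt")            // local filesystem
val hdfsLines  = spark.sparkContext.textFile("hdfs://clusterA/direct1/text1.txt") // cluster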
import java.io.OutputStreamWriter
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.hive.HiveContext
/**
 * Read an HDFS file
 *
 * @param aPath path of the file to read, e.g. hdfs://clusterA/direct1/text1.txt
 * @return the lines of the file
 */
def readHdfsFile(aPath: String): Array[String] = {
  val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
  val pathArr = aPath.split("//")
  val uri = pathArr.head + "//" + pathArr(1).split("/").head // e.g. hdfs://clusterA
  val path = new Path(aPath)
  val hdfs = org.apache.hadoop.fs.FileSystem.get(
    new java.net.URI(uri),
    new org.apache.hadoop.conf.Configuration())
  if (hdfs.exists(path))
    spark.sparkContext.textFile(aPath).collect()
  else
    Array.empty[String]
}
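A quick usage sketch (the sample path is an assumption):

val lines = readHdfsFile("hdfs://clusterA/direct1/text1.txt") // hypothetical path
lines.take(5).foreach(println)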
/**
 * Delete an HDFS directory
 *
 * @param aPath path to delete, e.g. hdfs://clusterA/direct1
 * @return true if the path existed and was deleted
 */
def deleteHdfsPath(aPath: String): Boolean = {
  val pathArr = aPath.split("//")
  val uri = pathArr.head + "//" + pathArr(1).split("/").head
  val path = new Path(aPath)
  val hdfs = org.apache.hadoop.fs.FileSystem.get(
    new java.net.URI(uri),
    new org.apache.hadoop.conf.Configuration())
  if (hdfs.exists(path))
    hdfs.delete(path, true) // recursive delete
  else false
}
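The URI parsing and FileSystem.get boilerplate above repeats in every helper; a minimal sketch factoring it out (fsFor is a hypothetical name, not in the original):

// Hypothetical helper: derive the FileSystem from a fully qualified path
def fsFor(aPath: String): FileSystem = {
  val pathArr = aPath.split("//")
  val uri = pathArr.head + "//" + pathArr(1).split("/").head
  FileSystem.get(new java.net.URI(uri), new org.apache.hadoop.conf.Configuration())
}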
/**
 * Get a file's last-modified time
 *
 * @param aPath file to inspect, e.g. hdfs://clusterA/direct1/file.txt
 * @return the modification time as a millisecond timestamp, if the file exists
 */
def getMdfTime(aPath: String): Option[Long] = {
  val pathArr = aPath.split("//")
  val uri = pathArr.head + "//" + pathArr(1).split("/").head
  val path = new Path(aPath)
  val hdfs = org.apache.hadoop.fs.FileSystem.get(
    new java.net.URI(uri),
    new org.apache.hadoop.conf.Configuration())
  if (hdfs.exists(path)) {
    val fileSt = hdfs.getFileStatus(path)
    Some(fileSt.getModificationTime) // millisecond timestamp, Long
  } else None
}
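To render the millisecond timestamp as a readable date, something like this works (the format string is an assumption):

import java.text.SimpleDateFormat

getMdfTime("hdfs://clusterA/direct1/file.txt").foreach { ms =>
  val fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
  println(fmt.format(new java.util.Date(ms)))
}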
/**
 * Write an HDFS file with Chinese-character support - overwrite mode
 *
 * @param aPath   path of the file to write, e.g. hdfs://clusterA/direct1/file.txt
 * @param content lines to write
 * @return
 */
def writeOverwrite(aPath: String, content: Iterator[String], hdfs: FileSystem) = {
  // Explicit UTF-8 so Chinese text survives regardless of the JVM default charset
  val path = new Path(aPath)
  val out = new OutputStreamWriter(hdfs.create(path, true), "UTF-8") // true = overwrite; with false an existing path throws
  content.foreach(str => out.write(str + "\n"))
  out.flush()
  out.close()
}
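A usage sketch (path and lines are assumptions):

val fs = fsFor("hdfs://clusterA/direct1/file.txt") // helper sketched above
writeOverwrite("hdfs://clusterA/direct1/file.txt", Iterator("第一行", "第二行"), fs)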
/**
 * Write an HDFS file with Chinese-character support - append mode
 *
 * @param filename path of the file to write, e.g. hdfs://clusterA/direct1/file.txt
 * @param content  lines to write
 * @return
 */
def writeAppend(filename: String, content: Iterator[String], hdfs: FileSystem) = {
  // Explicit UTF-8 so Chinese text survives regardless of the JVM default charset
  val path = new Path(filename)
  var fileOutputStream: FSDataOutputStream = null
  try {
    fileOutputStream =
      if (hdfs.exists(path)) hdfs.append(path) // append to the existing file
      else hdfs.create(path)                   // first write: create the file
    content.foreach(x => fileOutputStream.write((x + "\n").getBytes("UTF-8")))
  } finally {
    if (fileOutputStream != null) fileOutputStream.close()
  }
}
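Appending also requires the cluster to permit it (dfs.support.append on older Hadoop releases). A usage sketch under the same assumptions as above:

val fs2 = fsFor("hdfs://clusterA/direct1/file.txt") // helper sketched above
writeAppend("hdfs://clusterA/direct1/file.txt", Iterator("追加的一行"), fs2)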
/**
 * Save a DataFrame into a partition of the given Hive table
 *
 * @param sqlContext    HiveContext for running the DDL
 * @param tableNme      target table name
 * @param hdfsPath      table location on HDFS
 * @param DF            DataFrame to save
 * @param day           pt_d partition value
 * @param hour          pt_h partition value
 * @param numPartitions number of output files
 * @return
 */
def saveDF(sqlContext: HiveContext, tableNme: String, hdfsPath: String, DF: DataFrame, day: String, hour: String, numPartitions: Int): Unit = {
  // Method 1: write ORC files under the partition directory, then register the partition
  val savePath = hdfsPath + "/pt_d=" + day + "/pt_h=" + hour
  val sqlcode = "alter table " + tableNme + " add if not exists partition (pt_d='" + day + "', pt_h='" + hour + "')"
  // Overwrite mode
  deleteHdfsPath(savePath) // remove any existing files under the path first
  DF.repartition(numPartitions).write.format("orc").save(savePath)
  // Append mode: DF.repartition(numPartitions).write.mode("append").format("orc").save(savePath)
  sqlContext.sql("use biads")
  sqlContext.sql(sqlcode)

  // Method 2: insert via a temp view (tempViewName, logTableName, timeD, timeH, pt_channel are placeholders)
  val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
  DF.createOrReplaceTempView(tempViewName)
  spark.sql(s"INSERT OVERWRITE TABLE biads.${logTableName} PARTITION (pt_d='${timeD}',pt_h='${timeH}',pt_channel='${pt_channel}') select * from ${tempViewName}") // to append instead, change OVERWRITE to INTO

  // Method 3: convert an RDD to a DataFrame, write ORC, then register the partition (rdd, path, date, hour, minute, table are placeholders)
  val DATASCHEMA = Array("c1", "c2") // must match the column names of the Hive table
  val savePath3 = path + "/pt_d=" + date + "/pt_h=" + hour + "/pt_min=" + minute
  rdd.toDF(DATASCHEMA: _*)
    .coalesce(1)
    .write.mode("append").format("orc").save(savePath3) // .mode("append") can be changed to .mode("overwrite")
  val sqlCode = "alter table " + table + " add if not exists partition (pt_d='" + date + "', pt_h='" + hour + "', pt_min='" + minute + "')"
  sqlContext.sql(sqlCode)

  // Method 4: dynamic partitioning (df and ptm are placeholders)
  val dfWithPt = df.withColumn("pt_m", lit(ptm)) // add pt_m as the partition column
  dfWithPt.write.mode(SaveMode.Overwrite).insertInto(tableName)
  // Note: dynamic partitioning requires the following two settings:
  // hive.exec.dynamic.partition = true
  // hive.exec.dynamic.partition.mode = nonstrict
  // i.e.:
  val sparkDyn = SparkSession.builder()
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("spark.sql.parquet.writeLegacyFormat", "true")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .enableHiveSupport().getOrCreate()
}
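A call sketch for method 1, with hypothetical table and path names (someDF stands in for a real DataFrame):

saveDF(sqlContext, "my_table", "hdfs://clusterA/warehouse/my_table", someDF, "20240101", "08", 10)
sqlContext.sql("show partitions biads.my_table").show() // verify the partition was registered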
Write an RDD out as a single file:
yourRDD.coalesce(1).saveAsTextFile("outputPath")
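coalesce(1) funnels all the data through a single task, which can be slow for large RDDs. On Hadoop 2.x, FileUtil.copyMerge (removed in Hadoop 3) can merge the part files after a parallel write instead; a sketch with assumed paths:

import org.apache.hadoop.fs.{FileUtil, Path}

yourRDD.saveAsTextFile("hdfs://clusterA/tmp/parts") // parallel write, many part files
val conf = new org.apache.hadoop.conf.Configuration()
val fs = org.apache.hadoop.fs.FileSystem.get(new java.net.URI("hdfs://clusterA"), conf)
FileUtil.copyMerge(fs, new Path("hdfs://clusterA/tmp/parts"),
  fs, new Path("hdfs://clusterA/direct1/merged.txt"),
  true, conf, null) // true = delete the part directory after merging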