package com.lyg.core
import org.apache.spark.SparkContext
import org.apache.spark.rdd.{HadoopRDD, NewHadoopRDD, RDD}
import org.apache.spark.sql.SparkSession
/**
 * ClassName: ReadDataToDirectory
 * Author: yage.liu
 * Date: 2016-03-01 16:25
 * Version: V1.0
 * Description: Spark - obtaining the name and directory of the file each record is read from
 **/
object ReadDataToDirectory {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("ReadDataToDirectory")
      .master("local")
      .getOrCreate()
    val sc = spark.sparkContext
    val inputPath = "D:\\src\\sparkDatas\\aggregateData.txt"

    // New MapReduce API (org.apache.hadoop.mapreduce): (file path, line) pairs
    val resultDatas1: RDD[(String, String)] = readMapreduce(sc, inputPath)
    resultDatas1.foreach(println)
    println("--------------------------------------------------------------------------------------------")

    // Old mapred API (org.apache.hadoop.mapred): same result via HadoopRDD
    val resultDatas2: RDD[(String, String)] = readMapred(sc, inputPath)
    resultDatas2.foreach(println)

    spark.stop()
  }
  def readMapreduce(sc: SparkContext, inputPath: String): RDD[(String, String)] = {
    import org.apache.hadoop.io.{LongWritable, Text}
    import org.apache.hadoop.mapreduce.InputSplit
    import org.apache.hadoop.mapreduce.lib.input.{FileSplit, TextInputFormat}
    // Read with the new MapReduce input format, then cast to NewHadoopRDD to get
    // mapPartitionsWithInputSplit, which exposes each partition's InputSplit.
    val hdfsFileRdd = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](inputPath)
    val hadoopHdfsFileRDD = hdfsFileRdd.asInstanceOf[NewHadoopRDD[LongWritable, Text]]
    val resultDatas: RDD[(String, String)] = hadoopHdfsFileRDD.mapPartitionsWithInputSplit((inputSplit: InputSplit, iterator: Iterator[(LongWritable, Text)]) => {
      // For text input the split is a FileSplit; getPath is the full file path,
      // and getPath.getParent would give just the directory.
      val file = inputSplit.asInstanceOf[FileSplit]
      iterator.map(line => (file.getPath.toString, line._2.toString))
    })
    resultDatas
  }
  def readMapred(sc: SparkContext, inputPath: String): RDD[(String, String)] = {
    import org.apache.hadoop.io.{LongWritable, Text}
    import org.apache.hadoop.mapred.{FileSplit, InputSplit, TextInputFormat}
    // Same idea with the old mapred API: hadoopFile returns a HadoopRDD, whose
    // mapPartitionsWithInputSplit also exposes the per-partition InputSplit.
    val hdfsFileRdd = sc.hadoopFile[LongWritable, Text, TextInputFormat](inputPath)
    val hadoopHdfsFileRDD = hdfsFileRdd.asInstanceOf[HadoopRDD[LongWritable, Text]]
    val resultDatas: RDD[(String, String)] = hadoopHdfsFileRDD.mapPartitionsWithInputSplit((inputSplit: InputSplit, iterator: Iterator[(LongWritable, Text)]) => {
      val file = inputSplit.asInstanceOf[FileSplit]
      iterator.map(line => (file.getPath.toString, line._2.toString))
    })
    resultDatas
  }
}
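For comparison, here is a minimal DataFrame-based sketch (not part of the original listing) that answers the same question with org.apache.spark.sql.functions.input_file_name(), which returns the path of the file each row was read from. The object name ReadDataToDirectorySql is an illustrative assumption; the input path is simply reused from the code above.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.input_file_name

object ReadDataToDirectorySql {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("ReadDataToDirectorySql")
      .master("local")
      .getOrCreate()

    // spark.read.text produces one row per line in a single "value" column;
    // input_file_name() adds the full path of the source file for each row.
    val df = spark.read
      .text("D:\\src\\sparkDatas\\aggregateData.txt")
      .withColumn("fileName", input_file_name())

    df.show(truncate = false)
    spark.stop()
  }
}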