Spark多文件输出的两种形式

一只咸鱼va

已于 2022-10-26 22:34:44 修改

阅读量840

点赞数 2

分类专栏： spark 文章标签： spark scala 大数据

于 2022-10-26 22:31:55 首次发布

本文链接：https://blog.csdn.net/YYLong0/article/details/127542206

版权

spark 专栏收录该内容

7 篇文章 0 订阅

订阅专栏

#所需环境版本
jdk=1.8
scala.version=2.11.0
spark.version=2.3.2
hadoop.version=2.7.2

import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer

/**
 * Demo driver that writes the same kind of random key/value data twice,
 * once through the old `mapred` output API and once through the new
 * `mapreduce` output API, splitting the records into per-key files.
 */
object Main {

  /**
   * Builds a local two-thread SparkContext, runs [[process]] and always
   * stops the context afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("test")
      .setMaster("local[2]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // IntWritable and Text are registered with Kryo because process()
      // parallelizes a collection that contains instances of them.
      .registerKryoClasses(Array(classOf[IntWritable], classOf[Text]))
    val sc = new SparkContext(sparkConf)
    // Stop the context even when a job fails, so it is not leaked.
    try {
      process(sc)
    } finally {
      sc.stop()
    }
  }

  /**
   * Generates 1000 random pairs whose key is a digit in 0..9 and saves them
   * to the relative directories "output" (old API, String pairs) and
   * "output1" (new API, IntWritable/Text pairs).
   *
   * @param sc the SparkContext used to create and save both RDDs
   */
  def process(sc: SparkContext): Unit = {
    // Old (mapred) API: file names are chosen by RDDMultipleTextOutputFormat.
    val output = "output"
    val stringPairs: Seq[(String, String)] = Seq.fill(1000) {
      val key = (Math.random() * 10).toInt.toString
      (key, s"value_${key}")
    }
    sc.makeRDD(stringPairs).saveAsHadoopFile(
      output,
      classOf[String],
      classOf[String],
      classOf[RDDMultipleTextOutputFormat])

    // New (mapreduce) API: file names are chosen by MultipleFileOutputFormat.
    val output1 = "output1"
    val writablePairs: Seq[(IntWritable, Text)] = Seq.fill(1000) {
      val key = (Math.random() * 10).toInt
      (new IntWritable(key), new Text(s"value_${key}"))
    }
    sc.makeRDD(writablePairs).saveAsNewAPIHadoopFile(
      output1,
      classOf[IntWritable],
      classOf[Text],
      classOf[MultipleFileOutputFormat])
  }
}

import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
/**
 * Old-API (`mapred`) text output format that routes each record to a file
 * named after the record's key.
 */
class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[String, String] {

  /**
   * Chooses the output file for one record.
   *
   * The leaf name handed in by the framework (for example `part-00000`) is
   * kept as a suffix. Returning only the key would make every task write the
   * same file name for a given key, and those files would replace one another
   * when the task outputs are merged into the final directory, silently
   * dropping the records of all but one task.
   *
   * @param key   record key, used as the file name prefix
   * @param value record value (not used for naming)
   * @param name  leaf file name proposed by the framework for this task
   * @return a file name of the form `key-name`, relative to the output directory
   */
  override def generateFileNameForKeyValue(key: String, value: String, name: String): String =
    s"$key-$name"
}

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;

/**
 * New-API (mapreduce) output format that splits records into one text file
 * per integer key. Keys 0..9 map to the base names "file0".."file9"; only the
 * value of each record is written, one value per line.
 */
public class MultipleFileOutputFormat extends FileOutputFormat<IntWritable, Text> {
    /** Base file name for every supported key. */
    private static final Map<Integer, String> FILENAMES = new HashMap<Integer, String>();
    static {
        // A plain static block avoids the anonymous HashMap subclass that
        // double-brace initialisation would create.
        for (int i = 0; i < 10; i++) {
            FILENAMES.put(i, "file" + i);
        }
    }

    /** Character set used to encode the written values. */
    protected static final String CODESET = "utf-8";
    /** Encoded line terminator appended after every value. */
    protected static final byte[] newline;
    static {
        try {
            newline = "\n".getBytes(CODESET);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("failed to get newline bytes, ", e);
        }
    }

    /**
     * Resolves a file name inside the work directory of the current task attempt.
     *
     * @param context task attempt whose output committer supplies the work path
     * @param prefix  file name to resolve below the work path
     * @return the path of the file inside the task's work directory
     * @throws IOException if the output committer cannot be obtained
     */
    public Path getPathForWorkFile(TaskAttemptContext context, String prefix) throws IOException {
        FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(context);
        return new Path(committer.getWorkPath(), prefix);
    }

    /**
     * Creates a writer that lazily opens one output file per key and forwards
     * every record to the file that belongs to its key.
     *
     * @param job task attempt the writer works for
     * @return a writer that dispatches records by key
     * @throws IOException declared by the overridden method
     */
    @Override
    public RecordWriter<IntWritable, Text> getRecordWriter(TaskAttemptContext job) throws IOException {

        final TaskAttemptContext myJob = job;
        final Configuration conf = job.getConfiguration();

        return new RecordWriter<IntWritable, Text>() {
            // Slot i holds the writer for key i; it stays null until the key is first seen.
            private final LineRecorderWriter[] outWriter = new LineRecorderWriter[FILENAMES.size()];

            @Override
            public void write(IntWritable key, Text value) throws IOException, InterruptedException {
                final int index = key.get();
                if (index < 0 || index >= outWriter.length) {
                    // Fail with a readable message instead of a bare array index error.
                    throw new IOException("no output file configured for key " + index
                            + ", expected a key between 0 and " + (outWriter.length - 1));
                }
                if (outWriter[index] == null) {
                    // Add the task-specific suffix so that files written by different
                    // tasks for the same key do not replace one another when the task
                    // outputs are merged into the final output directory.
                    String uniqueName = getUniqueFile(myJob, FILENAMES.get(index), "");
                    Path filePath = getPathForWorkFile(myJob, uniqueName);
                    // Ask the path for its file system so that an output location on a
                    // non-default file system is handled correctly.
                    outWriter[index] = new LineRecorderWriter(filePath.getFileSystem(conf), filePath);
                }
                outWriter[index].write(key, value);
            }

            @Override
            public void close(TaskAttemptContext context) throws IOException, InterruptedException {
                // Close every opened writer even if one of them fails, then report
                // the first failure with the later ones attached as suppressed.
                IOException firstFailure = null;
                for (LineRecorderWriter writer : outWriter) {
                    if (writer == null) {
                        continue;
                    }
                    try {
                        writer.close(context);
                    } catch (IOException e) {
                        if (firstFailure == null) {
                            firstFailure = e;
                        } else {
                            firstFailure.addSuppressed(e);
                        }
                    }
                }
                if (firstFailure != null) {
                    throw firstFailure;
                }
            }
        };
    }

    /**
     * Writes the value of each record as one line of text to a single file;
     * the key is ignored.
     */
    public static class LineRecorderWriter extends RecordWriter<IntWritable, Text> {
        FSDataOutputStream out;

        /**
         * Creates the target file.
         *
         * @param fs   file system on which the file is created
         * @param path location of the file to create
         * @throws IOException if the file cannot be created
         */
        public LineRecorderWriter(FileSystem fs, Path path) throws IOException {
            out = fs.create(path);
        }

        @Override
        public void write(IntWritable key, Text value) throws IOException, InterruptedException {
            out.write(value.toString().getBytes(CODESET));
            out.write(newline);
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            out.close();
        }
    }
}