排序专题

最新推荐文章于 2024-06-20 18:22:05 发布

我要变成万人迷

最新推荐文章于 2024-06-20 18:22:05 发布

阅读量132

点赞数

本文链接：https://blog.csdn.net/qq_43668119/article/details/105425787

版权

1.部分排序
每个分区的key是有序的，但是整体无序（分区与分区之间是无序的）
2.全排序
解决方案：
1.设置reducer的个数为1.
2.自定义的分区规则进行分区。
3.随机抽样。
原因：由于数据不均衡或者数据会随时调整，在原有分区规则的基础上很有可能产生数据倾斜问题，使某个reduce节点的计算量远大于其他节点的计算量，影响整体计算的性能。
为了解决上面的问题，所以采用随机抽样的方式，动态的选择临界点作为分区的依据。
3.二次排序
如果一个字段不能满足排序条件（一般就是这个字段的值相等）的时候，会继续使用其他字段进行排序。并不仅仅限制只能排两次，可以依次使用更多的字段。
4.倒排索引
案例：
源数据：
a.txt a,a,b,c,b,c
b.txt x,b,s,b,e
c.txt t,y,n,c,b,a
思路：
1.拆分每个文件里的单词作为Key，这个单词所在的文件为value
2.对相同的单词，把它所在的文件名去重后合并，用逗号拼接成一个字符串。

package reverseIndex;

import mapReduce.wordCount.common.SubmitJobUtil;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

/**
 * MapReduce driver that builds an inverted index: for every word found in the
 * input files it outputs the comma-separated list of file names containing it.
 */
public class ReverseIndexApp {
    /**
     * Splits each input line on single spaces and emits one
     * (word, source file name) pair per non-empty token.
     */
    public static class riMapper extends Mapper<LongWritable,Text,Text,Text>{
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The input split tells us which file the current line was read from.
            FileSplit split = (FileSplit) context.getInputSplit();
            Text fileName = new Text(split.getPath().getName());
            // One Text instance is reused for every token instead of allocating per record.
            Text word = new Text();
            for (String token : value.toString().split(" ")) {
                // Blank lines and consecutive spaces yield empty tokens; emitting them
                // would create a meaningless empty-string key in the index.
                if (token.isEmpty()) {
                    continue;
                }
                word.set(token);
                context.write(word, fileName);
            }
        }
    }

    /**
     * Collapses all file names received for one word into a single
     * comma-separated value, keeping each file name once in first-seen order.
     */
    public static class riReducer extends Reducer<Text,Text,Text,Text>{
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // A LinkedHashSet drops duplicates while preserving the order in which
            // file names first appear, without a linear scan per value.
            Set<String> fileNames = new LinkedHashSet<>();
            for (Text fileName : values) {
                fileNames.add(fileName.toString());
            }
            // String.join places separators only between elements, so no trailing
            // comma has to be trimmed afterwards.
            context.write(key, new Text(String.join(",", fileNames)));
        }
    }

    /**
     * Submits the job through {@code SubmitJobUtil}, passing the command-line
     * arguments through unchanged.
     *
     * @param args command-line arguments forwarded to {@code SubmitJobUtil.submitJob}
     * @throws Exception if job configuration or execution fails
     */
    public static void main(String[] args) throws Exception {
        SubmitJobUtil.submitJob(ReverseIndexApp.class, args);
    }
}

package mapReduce.wordCount.common;

import mapReduce.wordCount.WordCountApp;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.net.URI;

/**
 * Helper that configures and runs a MapReduce job by reflecting over the
 * public nested classes of a driver class.
 */
public class SubmitJobUtil {

    /**
     * Configures a job from the public nested Mapper/Reducer classes of
     * {@code driverClass}, deletes the output path if it already exists, and
     * runs the job to completion.
     *
     * <p>The output key/value classes are taken from the third and fourth type
     * arguments of each nested class's generic superclass.
     *
     * @param driverClass class whose public nested classes supply the Mapper and Reducer
     * @param args        {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IllegalArgumentException if fewer than two arguments are given, or a
     *                                  Mapper/Reducer's output types cannot be determined
     * @throws IllegalStateException    if the job runs but does not complete successfully
     * @throws Exception                if job setup, file-system access or submission fails
     */
    public static void submitJob(Class<?> driverClass, String[] args) throws Exception {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "Expected two arguments: <input path> <output path>");
        }

        Job job = Job.getInstance();
        job.setJarByClass(driverClass);

        for (Class<?> innerClass : driverClass.getClasses()) {
            // Decide the role first so that unrelated public nested classes
            // (for example custom writables or partitioners) are simply skipped.
            if (Mapper.class.isAssignableFrom(innerClass)) {
                Type[] types = taskTypeArguments(innerClass);
                job.setMapperClass(innerClass.asSubclass(Mapper.class));
                job.setMapOutputKeyClass(toClass(types[2]));
                job.setMapOutputValueClass(toClass(types[3]));
            } else if (Reducer.class.isAssignableFrom(innerClass)) {
                Type[] types = taskTypeArguments(innerClass);
                job.setReducerClass(innerClass.asSubclass(Reducer.class));
                job.setOutputKeyClass(toClass(types[2]));
                job.setOutputValueClass(toClass(types[3]));
            }
        }

        // Remove a stale output directory; the lookup uses the job's own
        // configuration so the file system matches what the job will use.
        Path outPath = new Path(args[1]);
        FileSystem fs = outPath.getFileSystem(job.getConfiguration());
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, outPath);

        // Surface a failed job to the caller instead of returning as if it succeeded.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("MapReduce job failed: " + job.getJobName());
        }
    }

    /** Returns the type arguments of the generic superclass of a Mapper/Reducer class. */
    private static Type[] taskTypeArguments(Class<?> taskClass) {
        Type superType = taskClass.getGenericSuperclass();
        if (!(superType instanceof ParameterizedType)) {
            throw new IllegalArgumentException(
                    taskClass.getName() + " must directly extend a parameterized Mapper or Reducer");
        }
        Type[] types = ((ParameterizedType) superType).getActualTypeArguments();
        if (types.length < 4) {
            throw new IllegalArgumentException(
                    taskClass.getName() + " does not declare four type arguments on its superclass");
        }
        return types;
    }

    /** Resolves a reflected type argument to the class object it denotes. */
    private static Class<?> toClass(Type type) {
        if (type instanceof Class) {
            return (Class<?>) type;
        }
        if (type instanceof ParameterizedType) {
            // For a type such as Foo<Bar> the job needs the raw class Foo.
            return toClass(((ParameterizedType) type).getRawType());
        }
        throw new IllegalArgumentException(
                "Cannot resolve a concrete class for type " + type.getTypeName());
    }
}