linux-大数据库实验5 文本复制

最新推荐文章于 2023-05-25 08:55:58 发布

qq_53474735

最新推荐文章于 2023-05-25 08:55:58 发布

阅读量161

点赞数

文章标签： linux 数据库 hadoop

本文链接：https://blog.csdn.net/qq_53474735/article/details/130382686

版权

1-----------------------------------------

2-----------------------------------------------

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MergeSort {

   /**
   * @param args
   * 输入多个文件，每个文件中的每行内容均为一个整数
   * 输出到一个新的文件中，输出的数据格式为每行两个整数，第一个数字为第二个整数的排序位次，第二个整数为原待排列的整数
   */
   //map函数读取输入中的value，将其转化成IntWritable类型，最后作为输出key
   public static class Map extends Mapper<Object, Text, IntWritable, IntWritable>{

       private static IntWritable data = new IntWritable();
       public void map(Object key, Text value, Context context) throws IOException,InterruptedException{
           String text = value.toString();
           data.set(Integer.parseInt(text));//将括号内容复制给data对象
           context.write(data, new IntWritable(1));//括号内容作为中间结果扔出去交给shuffle处理
       }
   }

   //reduce函数将map输入的key复制到输出的value上，然后根据输入的value-list中元素的个数决定key的输出次数,定义一个全局变量line_num来代表key的位次
   public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{
       private static IntWritable line_num = new IntWritable(1);

       public void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException,InterruptedException{
           for(IntWritable val : values){
               context.write(line_num, key);
               line_num = new IntWritable(line_num.get() + 1);
           }
       }
   }

   //自定义Partition函数，此函数根据输入数据的最大值和MapReduce框架中Partition的数量获取将输入数据按照大小分块的边界，然后根据输入数值和边界的关系返回对应的Partiton ID
   public static class Partition extends Partitioner<IntWritable, IntWritable>{
       public int getPartition(IntWritable key, IntWritable value, int num_Partition){
           int Maxnumber = 65223;//int型的最大数值
           int bound = Maxnumber/num_Partition+1;
           int keynumber = key.get();//从key的序列类型转换成int类型
           for (int i = 0; i<num_Partition; i++){
               if(keynumber<bound * (i+1) && keynumber>=bound * i){
                   return i;
               }
           }
           return -1;// 表示返回一个代数值，一般用在子函数结尾。按照程序开发的一般惯例，表示该函数失败；
       }
   }

   public static void main(String[] args) throws Exception{
       // TODO Auto-generated method stub
       Configuration conf = new Configuration();//程序运行时的参数
       conf.set("fs.default.name","hdfs://localhost:9000");
       String[] otherArgs = new String[]{"input","output"}; /* 直接设置输入参数 */
       if (otherArgs.length != 2) {
           System.err.println("Usage: wordcount <in><out>");
           System.exit(2);
           }
       Job job = Job.getInstance(conf,"Merge and sort");//设置环境参数
       job.setJarByClass(MergeSort.class);//设置整个程序的类名
       job.setMapperClass(Map.class);//添加Mapper类
       job.setReducerClass(Reduce.class);//添加Reducer类
       job.setPartitionerClass(Partition.class);//添加Partitioner类
       job.setOutputKeyClass(IntWritable.class);//设置输出类型
       job.setOutputValueClass(IntWritable.class);//设置输出类型
       FileInputFormat.addInputPath(job, new Path(otherArgs[0]));//设置输入原始文件文件路径
       FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//设置输出文件路径
       //Job运行是通过job.waitForCompletion(true)，true表示将运行进度等信息及时输出给用户，false的话只是等待作业结束
       boolean result = job.waitForCompletion(true);
       System.exit(result ? 0 : 1);
   }

}

3---------------------------------

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class simple_data_mining {
   public static int time = 0;

   /**
   * @param args
   * 输入一个child-parent的表格
   * 输出一个体现grandchild-grandparent关系的表格
   */
   //Map将输入文件按照空格分割成child和parent，然后正序输出一次作为右表，反序输出一次作为左表，需要注意的是在输出的value中必须加上左右表区别标志
   public static class Map extends Mapper<Object, Text, Text, Text>{
       public void map(Object key, Text value, Context context) throws IOException,InterruptedException{
           String child_name = new String();
           String parent_name = new String();
           String relation_type = new String();
           String line = value.toString();
           int i = 0;
           while(line.charAt(i) != ' '){
               i++;
           }
           String[] values = {line.substring(0,i),line.substring(i+1)};
           if(values[0].compareTo("child") != 0){
               child_name = values[0];
               parent_name = values[1];
               relation_type = "1";//左右表区分标志
               context.write(new Text(values[1]), new Text(relation_type+"+"+child_name+"+"+parent_name));
               //左表
               relation_type = "2";
               context.write(new Text(values[0]), new Text(relation_type+"+"+child_name+"+"+parent_name));
               //右表
           }
       }
   }

   public static class Reduce extends Reducer<Text, Text, Text, Text>{
       public void reduce(Text key, Iterable<Text> values,Context context) throws IOException,InterruptedException{
           if(time == 0){ //输出表头
               context.write(new Text("grandchild"), new Text("grandparent"));
               time++;
           }
           int grand_child_num = 0;
           String grand_child[] = new String[10];
           int grand_parent_num = 0;
           String grand_parent[]= new String[10];
           Iterator ite = values.iterator();
           while(ite.hasNext()){
               String record = ite.next().toString();
               int len = record.length();
               int i = 2;
               if(len == 0) continue;
               char relation_type = record.charAt(0);
               String child_name = new String();
               String parent_name = new String();
               //获取value-list中value的child

               while(record.charAt(i) != '+'){
                   child_name = child_name + record.charAt(i);
                   i++;
               }
               i=i+1;
               //获取value-list中value的parent
               while(i<len){
                   parent_name = parent_name+record.charAt(i);
                   i++;
               }
               //左表，取出child放入grand_child
               if(relation_type == '1'){
                   grand_child[grand_child_num] = child_name;
                   grand_child_num++;
               }
               else{//右表，取出parent放入grand_parent
                   grand_parent[grand_parent_num] = parent_name;
                   grand_parent_num++;
               }
           }

           if(grand_parent_num != 0 && grand_child_num != 0 ){
               for(int m = 0;m<grand_child_num;m++){
                   for(int n=0;n<grand_parent_num;n++){
                       context.write(new Text(grand_child[m]), new Text(grand_parent[n]));
                       //输出结果
                   }
               }
           }
       }
   }
   public static void main(String[] args) throws Exception{
       // TODO Auto-generated method stub
       Configuration conf = new Configuration();
conf.set("fs.default.name","hdfs://localhost:9000");
       String[] otherArgs = new String[]{"input","output"}; /* 直接设置输入参数 */
       if (otherArgs.length != 2) {
           System.err.println("Usage: wordcount <in><out>");
           System.exit(2);
           }
Job job = Job.getInstance(conf,"Single table join");
       job.setJarByClass(simple_data_mining.class);
       job.setMapperClass(Map.class);
       job.setReducerClass(Reduce.class);
       job.setOutputKeyClass(Text.class);
       job.setOutputValueClass(Text.class);
       FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
       FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
       System.exit(job.waitForCompletion(true) ? 0 : 1);

   }
}

qq_53474735

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
linux-大数据库实验5 文本复制

自定义Partition函数：根据输入数据的最大值和MapReduce框架中Partition的数量，计算将输入数据按大小分块的边界，然后根据输入数值与边界的关系返回对应的Partition ID。reduce函数将map传入的key复制到输出的value上，并根据输入value-list中元素的个数决定key的输出次数，同时定义一个全局变量line_num来表示key的位次。
复制链接

扫一扫