1. Data Deduplication
First, download IDEA and the JDK; there are plenty of guides online for setting up the environment.
On the host machine, download the same Hadoop version as the one on the virtual machine and replace its bin directory; this part is covered in detail in:
How to connect IDEA to Hadoop
You only need the part about connecting to HDFS; the testing after the connection can be skipped.
Note: once connected, a circular refresh-style icon appears in the upper-right corner of the editor; click it to import the dependencies (it shows up roughly at the position marked in the figure; I forgot to take a screenshot at the time).
Code reference
My project structure
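In case that reference ever becomes unavailable, the core of the deduplication job is small enough to sketch here. This is my own minimal version, not necessarily identical to the referenced code, and the class name Dedup is arbitrary:

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class Dedup {
    // Each whole input line becomes the key; the value carries no information.
    public static class DedupMapper extends Mapper<Object, Text, Text, Text> {
        private static final Text EMPTY = new Text("");
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            context.write(value, EMPTY);
        }
    }
    // Identical lines are grouped together by the shuffle, so writing each key once removes duplicates.
    public static class DedupReducer extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }
}

The driver has the same shape as the calcGPA main method further down, except that the output value class is Text.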
Upload file1.txt and file2.txt on the virtual machine; after the previous experiment you should know how to create a directory and upload files to the cluster.
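If you prefer to do the upload from IDEA rather than from the virtual machine's shell, the HDFS Java API can do the same thing. A sketch (the local Windows paths are made up; the NameNode address is the hdfs://master:9000 used throughout this post):

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadInput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create("hdfs://master:9000"), conf);
        fs.mkdirs(new Path("/input1"));                        // create the input directory
        fs.copyFromLocalFile(new Path("D:/data/file1.txt"),    // hypothetical local paths
                new Path("/input1/file1.txt"));
        fs.copyFromLocalFile(new Path("D:/data/file2.txt"),
                new Path("/input1/file2.txt"));
        fs.close();
    }
}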
Note that the code has been modified; the input and output paths are specified like this:
FileInputFormat.addInputPath(job, new Path("hdfs://master:9000/input1/"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/output1/"));
Run it, and you get an error:
Permission denied: user=root, access=WRITE, inode="/user":hdfs:supergroup:drwxr-xr-x
Add the following to hdfs-site.xml in the Hadoop configuration:
<property>
    <name>dfs.permissions</name>
    <value>false</value>
</property>
I forget whether it's the hdfs-site.xml used on the IDEA side or the one on the virtual machine that needs this; changing both does no harm anyway. (Permission checking is enforced by the NameNode, so the cluster-side file on the virtual machine should be the one that actually matters, and the NameNode has to be restarted for the change to take effect. In newer Hadoop versions the property is called dfs.permissions.enabled; the old name still works but is deprecated.)
Now it works.
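As an aside, an alternative workaround that I did not test in this experiment is to leave permission checking on and instead have the client identify itself as the directory owner (hdfs in the error above) before the job is set up:

// Pretend to be the hdfs user so that writes under /user are allowed.
// This only applies to clusters without Kerberos, as in this experiment.
System.setProperty("HADOOP_USER_NAME", "hdfs");
Configuration conf = new Configuration();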
2. Data Sorting
Code reference
My project structure
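Again, in case the link goes stale: the usual version of this exercise reads one integer per line and outputs rank and value in ascending order. A minimal sketch of the mapper and reducer (my own version, not necessarily what the referenced code does):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class Sort {
    // The number itself becomes the key, so the shuffle sorts the data for us.
    public static class SortMapper extends Mapper<Object, Text, IntWritable, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString().trim();
            if (!line.isEmpty()) {
                context.write(new IntWritable(Integer.parseInt(line)), ONE);
            }
        }
    }
    // With a single reduce task the keys arrive in ascending order; a running
    // counter turns that order into the rank column of the output.
    public static class SortReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        private int rank = 1;
        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            for (IntWritable ignored : values) {
                context.write(new IntWritable(rank), key);
                rank++;
            }
        }
    }
}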
3. Average Score
package lab1.task3;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class calcGPA {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String fileAddress1 = "hdfs://master:9000/input3/";
        String fileAddress2 = "hdfs://master:9000/";
        // Instead of reading paths from the command line, the three subject files
        // and the output directory are hard-coded here.
        //String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
        String[] otherArgs = new String[]{fileAddress1 + "math.txt", fileAddress1 + "china.txt",
                fileAddress1 + "english.txt", fileAddress2 + "output3"};
        if (otherArgs.length < 2) {
            System.err.println("Usage: calcGPA <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "calc GPA");
        job.setJarByClass(calcGPA.class);
        job.setMapperClass(calcGPA.TokenizerMapper.class);
        // Careful: reusing the reducer as a combiner only happens to work here because
        // each map task sees at most one score per student; in general, averaging
        // partial averages does not give the overall average.
        job.setCombinerClass(calcGPA.IntSumReducer.class);
        job.setReducerClass(calcGPA.IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Every path except the last one is an input; the last one is the output directory.
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Sums every score seen for a student and writes the integer average.
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            int count = 0;
            for (IntWritable val : values) {
                sum += val.get();
                count++;
            }
            int average = sum / count;   // integer division, as the exercise expects
            context.write(key, new IntWritable(average));
        }
    }

    // Splits each input line into "name score" and emits (name, score).
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString(), "\n");
            while (itr.hasMoreTokens()) {
                StringTokenizer iitr = new StringTokenizer(itr.nextToken());
                String name = iitr.nextToken();
                String score = iitr.nextToken();
                context.write(new Text(name), new IntWritable(Integer.parseInt(score)));
            }
        }
    }
}
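One gotcha when re-running any of these jobs: FileOutputFormat refuses to start if the output directory already exists, so a second run against output3 fails with an "Output directory ... already exists" error. Either change the output path each time or delete the old directory first, for example with a snippet like this near the top of main (it needs the org.apache.hadoop.fs.FileSystem and java.net.URI imports):

FileSystem fs = FileSystem.get(URI.create("hdfs://master:9000"), conf);
Path out = new Path("/output3");
if (fs.exists(out)) {
    fs.delete(out, true);   // true = delete recursively
}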
4. Single-Table Join
package lab1.task4;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class STjoin {

    public static int time = 0;

    public static class Map extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String relationtype;
            String line = value.toString();
            System.out.println("mapper...............");

            // Split the line into the child and parent columns.
            int i = 0;
            String[] values = new String[10];
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                values[i] = itr.nextToken();
                i = i + 1;
            }
            System.out.println("child:" + values[0] + " parent:" + values[1]);

            // Skip the header line, whose first column is literally "child"
            // (compareTo returns 0 for the header, non-zero otherwise).
            if (values[0].compareTo("child") != 0) {
                // Emit the record twice: under the parent's name tagged "1" (the value is a child),
                // and under the child's name tagged "2" (the value is a parent).
                relationtype = "1";
                context.write(new Text(values[1]), new Text(relationtype + "+" + values[0]));
                System.out.println("key:" + values[1] + " value: " + relationtype + "+" + values[0]);
                relationtype = "2";
                context.write(new Text(values[0]), new Text(relationtype + "+" + values[1]));
                System.out.println("key:" + values[0] + " value: " + relationtype + "+" + values[1]);
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            System.out.println("reduce.....................");
            System.out.println("key:" + key + " values:" + values);

            // Write the header of the output table exactly once.
            if (time == 0) {
                context.write(new Text("grandchild"), new Text("grandparent"));
                time++;
            }

            int grandchildnum = 0;
            String[] grandchild = new String[10];
            int grandparentnum = 0;
            String[] grandparent = new String[10];
            String name;

            // Sort the grouped values into grandchildren (tag '1') and grandparents (tag '2').
            for (Text val : values) {
                String record = val.toString();
                System.out.println("record: " + record);
                char relationtype = record.charAt(0);
                name = record.substring(2);
                System.out.println("name: " + name);
                if (relationtype == '1') {
                    grandchild[grandchildnum] = name;
                    grandchildnum++;
                } else {
                    grandparent[grandparentnum] = name;
                    grandparentnum++;
                }
            }

            // Pair every grandchild with every grandparent found under this key.
            if (grandparentnum != 0 && grandchildnum != 0) {
                for (int m = 0; m < grandchildnum; m++) {
                    for (int n = 0; n < grandparentnum; n++) {
                        context.write(new Text(grandchild[m]), new Text(grandparent[n]));
                        System.out.println("grandchild: " + grandchild[m] + " grandparent: " + grandparent[n]);
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "single table join");
        job.setJarByClass(STjoin.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://master:9000/input4/"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/output4/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
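To see how the "1+" / "2+" tags drive the join, take a hypothetical input4 (the data is invented for illustration; the first line is the header that the mapper skips):

child parent
Tom Lucy
Lucy Mary

For "Tom Lucy" the mapper emits key Lucy with value 1+Tom and key Tom with value 2+Lucy; for "Lucy Mary" it emits key Mary with value 1+Lucy and key Lucy with value 2+Mary. The reducer for key Lucy therefore sees both a grandchild list [Tom] (tag 1) and a grandparent list [Mary] (tag 2), and the nested loops at the end write the pair Tom Mary: Tom's grandparent is Mary.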