MapReduce Word Count: From Local Run to Cluster Run

This walkthrough assumes the Hadoop client has already been configured on Windows; Hadoop 3.1.3 is used throughout.
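Before going any further it is worth confirming that the Windows client actually works; a minimal check, assuming the hadoop command is on the PATH:

hadoop version
# should report version 3.1.3 if the client described above is being picked up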

Sample input data:

As soon as an abnormally elevated IOP was encountered, patient was excluded from the study
and the prior regimen was reestablished. RESULTS This study included 53 eyes of 53
patients with open angle glaucoma. Twenty-seven patients suffered from primary open
angle glaucoma and 26 patients had pseudoexfoliative glaucoma. After beginning the
second phase of the study, a mild trend of increasing IOP was recordable. A corresponding
trend was even detected in female and male patients separately. The P values at week
In the first 2 weeks after initiation of the 2nd phase, 66% of cases have no change
in IOPs, but thereafter, 69.8%, experienced increasing IOPs. The present
study shows the superiority of the conventional dosage of Latanoprost 0.005% in
comparison with once every other day dose but at least in first few weeks, the
IOPs are reasonably close to each other. Further studies with higher number of
cases would widen the present findings.

# Mapper code

package com.pw.study.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;


public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Text outKey = new Text();
    private final IntWritable outValue = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        
        // 1. read one line of input
        String line = value.toString();
        // 2. split on runs of whitespace so repeated spaces do not produce empty tokens
        String[] values = line.split("\\s+");
        for (String item : values) {
            // emit (word, 1) for every word
            outKey.set(item);
            outValue.set(1);
            context.write(outKey, outValue);
        }
    }
}

# Reducer code

package com.pw.study.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // reusable output value
    private final IntWritable outValue = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // sum the counts for this word
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        outValue.set(sum);
        context.write(key, outValue);
    }
}

# WCDriver main class: launch it directly on the local machine (local mode) to get the results


package com.pw.study.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WCDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // create the MR job with a default (local) configuration
        Job job = Job.getInstance(new Configuration());
        // no jar or driver class needs to be registered for a purely local run

        /* set the Mapper implementation */
        job.setMapperClass(WCMapper.class);
        /* set the Reducer implementation */
        job.setReducerClass(WCReducer.class);

        // map output key type
        job.setMapOutputKeyClass(Text.class);
        // map output value type
        job.setMapOutputValueClass(IntWritable.class);

        // final output key type
        job.setOutputKeyClass(Text.class);
        // final output value type
        job.setOutputValueClass(IntWritable.class);


        // input path on the local file system
        FileInputFormat.setInputPaths(job, new Path("E:\\work\\note\\iodata\\input"));
        // output path (must not already exist)
        FileOutputFormat.setOutputPath(job, new Path("E:\\work\\note\\iodata\\out3"));

        // submit the job and wait for completion
        job.waitForCompletion(true);
    }
}

# Running against HDFS: package the WordCount project locally, copy the jar to one of the cluster nodes, and upload the input file into /input on HDFS (the upload is sketched just below). Then run the command shown after it. Because no main class was configured when the jar was packaged, the fully qualified driver class name has to be supplied on the command line; everything else follows the official example.
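Uploading the data could look like this (a minimal sketch; the local file name wc_input.txt is a placeholder for whatever input file you use):

hdfs dfs -mkdir -p /input
hdfs dfs -put ./wc_input.txt /input
# the jar itself only needs to be on the node's local disk, e.g. copied over with scp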

hadoop jar ./MR_WC.jar com.pw.study.wordcount.WCDriverForHDFSAndSingle /input /output
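Once the job finishes, the result can be inspected straight from HDFS (assuming the default output file name produced by a single reducer):

hdfs dfs -cat /output/part-r-00000

The driver class used for this run differs from the local one only in that it registers the jar class with setJarByClass and takes the input and output paths from the command-line arguments: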
package com.pw.study.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Run against HDFS (single-node style submission).
 * 1. register the main driver class (setJarByClass)
 * 2. take the input and output paths from the command-line arguments
 */
public class WCDriverForHDFSAndSingle {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // create the MR job
        Job job = Job.getInstance(new Configuration());
        // register the driver class so the framework can locate the job jar
        job.setJarByClass(WCDriverForHDFSAndSingle.class);

        /* set the Mapper implementation */
        job.setMapperClass(WCMapper.class);
        /* set the Reducer implementation */
        job.setReducerClass(WCReducer.class);

        // map output key type
        job.setMapOutputKeyClass(Text.class);
        // map output value type
        job.setMapOutputValueClass(IntWritable.class);

        // final output key type
        job.setOutputKeyClass(Text.class);
        // final output value type
        job.setOutputValueClass(IntWritable.class);


        // input path taken from the first command-line argument
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // output path taken from the second command-line argument
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // submit the job and wait for completion
        job.waitForCompletion(true);
    }
}

# Connecting to the HDFS cluster from the local machine: configure the connection parameters, set up the run environment, then run

Note on steps 5 and 6: build the jar first, while the code does not yet contain the setJar(...) call; a packaging sketch follows below.
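The jar path used in step 6 points into a Maven target directory, so the project is presumably built with Maven; a minimal packaging sketch under that assumption:

mvn clean package -DskipTests
# produces target\MRDemo1_wordCount-1.0-SNAPSHOT.jar, the path later passed to job.setJar(...)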

package com.pw.study.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Submit to the YARN cluster from the local machine.
 * 1. configure the connection parameters:
 *    // point the client at HDFS (the NameNode address)
 *    configuration.set("fs.defaultFS", "hdfs://hadoop102:8020");
 *    // run MapReduce on YARN
 *    configuration.set("mapreduce.framework.name", "yarn");
 *    // allow cross-platform submission to a remote cluster
 *    configuration.set("mapreduce.app-submission.cross-platform", "true");
 *    // location of the YARN ResourceManager
 *    configuration.set("yarn.resourcemanager.hostname", "hadoop103");
 * 2. set the main driver class
 * 3. take the input/output paths from the command-line arguments
 * 4. configure the run environment (the run/program arguments in the IDE run configuration)
 * 5. after packaging, comment out setJarByClass
 * 6. set the absolute path of the jar with setJar
 */
public class WCDriverForHDFSAndCluster {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        // cluster-run parameters: the HDFS NameNode address
        conf.set("fs.defaultFS", "hdfs://hadoop102:8020");
        // run MapReduce on YARN
        conf.set("mapreduce.framework.name", "yarn");
        // allow cross-platform submission from Windows to the remote cluster
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // location of the YARN ResourceManager
        conf.set("yarn.resourcemanager.hostname", "hadoop103");

        // create the MR job
        Job job = Job.getInstance(conf);
        // driver class registration, commented out once the jar is built (step 5)
        // job.setJarByClass(WCDriverForHDFSAndCluster.class);

        // point directly at the absolute path of the packaged jar instead (step 6)
        job.setJar("E:\\code\\hadoop_code\\MRDemo1_wordCount\\target\\MRDemo1_wordCount-1.0-SNAPSHOT.jar");
        /* set the Mapper implementation */
        job.setMapperClass(WCMapper.class);
        /* set the Reducer implementation */
        job.setReducerClass(WCReducer.class);

        // map output key type
        job.setMapOutputKeyClass(Text.class);
        // map output value type
        job.setMapOutputValueClass(IntWritable.class);

        // final output key type
        job.setOutputKeyClass(Text.class);
        // final output value type
        job.setOutputValueClass(IntWritable.class);


        // input path taken from the first command-line argument
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // output path taken from the second command-line argument
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // submit the job and wait for completion
        job.waitForCompletion(true);
    }
}
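
Once waitForCompletion returns, the submission can also be verified from the cluster side with the standard YARN CLI (run on any cluster node):

yarn application -list -appStates FINISHED
# the word-count job should appear in the list with final state SUCCEEDED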
