A Hands-On Guide to Writing and Running MapReduce Code

A First Taste of MapReduce Programming

Requirement: count the total number of occurrences of each word in a given text file (we will run it locally on a physical machine first).

Prepare the data as follows:
First, create a file named aaa.txt on the C: drive (any drive will do).
Put the following lines into it:
hello,world,hadoop
hello,hive,sqoop,flume
kitty,tom,jerry,world
hadoop
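
If you would rather create the test file from code instead of by hand, a minimal sketch is shown below (it assumes the C:\aaa.txt path used later in this tutorial; adjust the path for your machine):

```java
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;

public class PrepareData {
    public static void main(String[] args) throws Exception {
        // Write the four sample lines to C:\aaa.txt
        Files.write(Paths.get("C:\\aaa.txt"),
                Arrays.asList("hello,world,hadoop",
                        "hello,hive,sqoop,flume",
                        "kitty,tom,jerry,world",
                        "hadoop"));
    }
}
```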
Then open your code editor (I am using IDEA here).

Create a Maven project


Next, configure the POM file

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.itcast</groupId>
    <artifactId>mapreduce</artifactId>
    <version>1.0-SNAPSHOT</version>
    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0-mr1-cdh5.14.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.0-cdh5.14.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.6.0-cdh5.14.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.6.0-cdh5.14.0</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.testng</groupId>
            <artifactId>testng</artifactId>
            <version>RELEASE</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <minimizeJar>true</minimizeJar>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
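
Because the maven-shade-plugin above is bound to the package phase, running `mvn clean package` builds the project and produces a self-contained (shaded) jar under the target directory. You do not need that jar for the local run below, but it is what you would ship when submitting the job to a real cluster.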

Define a Mapper class

package com.czxy;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WorldCountMap extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // key   is the byte offset of this line within the input file
        // value is the text of the line, e.g. zhangsan,lisi,wangwu

        // 1. Convert value from Text to String
        String line = value.toString();
        // 2. Split the line on commas
        String[] splits = line.split(",");
        // 3. Emit each word with a count of 1
        for (String data : splits) {
            context.write(new Text(data), new LongWritable(1));
        }

        // For the example line above, this emits:
        // zhangsan 1
        // lisi     1
        // wangwu   1
    }
}
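
A small aside: Hadoop mappers are often written so that the output Writable objects are reused across calls instead of being allocated for every record. A sketch of that variant is below; the class name WorldCountMapReuse is just a placeholder, and the logic is identical to the mapper above:

```java
package com.czxy;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WorldCountMapReuse extends Mapper<LongWritable, Text, Text, LongWritable> {
    // Reused across map() calls to avoid allocating new objects per record
    private final Text word = new Text();
    private static final LongWritable ONE = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String data : value.toString().split(",")) {
            word.set(data);
            context.write(word, ONE);
        }
    }
}
```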

Define a Reducer class

package com.czxy;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WorldCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // key    is a word
        // values is the list of 1s emitted by the mappers for that word
        long sum = 0;
        // Add up the 1s
        for (LongWritable value : values) {
            sum += value.get();
        }
        // Write out: word <tab> total count
        context.write(key, new LongWritable(sum));
    }
}
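
To make the data flow concrete: the shuffle phase groups the mapper output by key, so this reducer is called once per distinct word. With the sample file above, the key hello arrives with values [1, 1] and "hello 2" is written; world and hadoop likewise sum to 2, and the remaining words (hive, sqoop, flume, kitty, tom, jerry) each sum to 1.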

Define a driver class that describes the job and submits it

package com.czxy;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WorldCountDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // Wire the Mapper and Reducer written above into the MapReduce framework
        // 1. Create a job
        Job job = Job.getInstance(new Configuration(), "WordCount34");
        // 2. Tell the job how and where to read the input (format and path)
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("C:\\aaa.txt"));
        // 3. Set the Mapper class and its output key/value types
        job.setMapperClass(WorldCountMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 4. Set the Reducer class and its output key/value types
        job.setReducerClass(WorldCountReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 5. Tell the job how and where to write the output
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("C:\\BBB"));
        // 6. Submit the job and return an exit code
        return job.waitForCompletion(true) ? 0 : 1;
    }

    // Run the job
    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new WorldCountDriver(), args);
        System.out.println(run);
    }
}
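
Two optional lines you might add inside run() right after creating the job (both are standard Job methods, not part of the original code): setJarByClass tells Hadoop which jar contains your classes, which is what the "No job jar file set" warning in the log below is about, and, because word counts are simple sums, the reducer can also be registered as a combiner so counts are pre-aggregated on the map side:

```java
// Optional additions inside run(), after Job.getInstance(...)
job.setJarByClass(WorldCountDriver.class);     // identify the jar that holds these classes
job.setCombinerClass(WorldCountReduce.class);  // pre-aggregate counts before the shuffle
```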

Just run it.
The output of the run looks like this:

D:\Java\jdk1.8.0_111\jdk\bin\java.exe -Dfile.encoding=UTF-8 -classpath <JDK, project, and Maven-repository jars omitted here for brevity> com.czxy2.WordCountDriver
19/11/12 15:03:37 INFO Configuration.deprecation: session.id is deprecated. Instead, use dfs.metrics.session-id
19/11/12 15:03:37 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId=
19/11/12 15:03:37 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
19/11/12 15:03:37 WARN mapred.JobClient: No job jar file set.  User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
19/11/12 15:03:37 INFO input.FileInputFormat: Total input paths to process : 1
19/11/12 15:03:37 INFO mapred.LocalJobRunner: OutputCommitter set in config null
19/11/12 15:03:37 INFO mapred.JobClient: Running job: job_local779279430_0001
19/11/12 15:03:37 INFO mapred.LocalJobRunner: OutputCommitter is org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
19/11/12 15:03:37 INFO mapred.LocalJobRunner: Waiting for map tasks
19/11/12 15:03:37 INFO mapred.LocalJobRunner: Starting task: attempt_local779279430_0001_m_000000_0
19/11/12 15:03:37 WARN mapreduce.Counters: Group org.apache.hadoop.mapred.Task$Counter is deprecated. Use org.apache.hadoop.mapreduce.TaskCounter instead
19/11/12 15:03:37 INFO mapred.Task:  Using ResourceCalculatorPlugin : null
19/11/12 15:03:37 INFO mapred.MapTask: Processing split: file:/C:/SumData.txt:0+9300
19/11/12 15:03:37 INFO mapred.MapTask: Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
19/11/12 15:03:37 INFO mapred.MapTask: io.sort.mb = 100
19/11/12 15:03:37 INFO mapred.MapTask: data buffer = 79691776/99614720
19/11/12 15:03:37 INFO mapred.MapTask: record buffer = 262144/327680
19/11/12 15:03:37 INFO mapred.LocalJobRunner: 
19/11/12 15:03:37 INFO mapred.MapTask: Starting flush of map output
19/11/12 15:03:38 INFO mapred.MapTask: Finished spill 0
19/11/12 15:03:38 INFO mapred.Task: Task:attempt_local779279430_0001_m_000000_0 is done. And is in the process of commiting
19/11/12 15:03:38 INFO mapred.LocalJobRunner: 
19/11/12 15:03:38 INFO mapred.Task: Task 'attempt_local779279430_0001_m_000000_0' done.
19/11/12 15:03:38 INFO mapred.LocalJobRunner: Finishing task: attempt_local779279430_0001_m_000000_0
19/11/12 15:03:38 INFO mapred.LocalJobRunner: Map task executor complete.
19/11/12 15:03:38 WARN mapreduce.Counters: Group org.apache.hadoop.mapred.Task$Counter is deprecated. Use org.apache.hadoop.mapreduce.TaskCounter instead
19/11/12 15:03:38 INFO mapred.Task:  Using ResourceCalculatorPlugin : null
19/11/12 15:03:38 INFO mapred.LocalJobRunner: 
19/11/12 15:03:38 INFO mapred.Merger: Merging 1 sorted segments
19/11/12 15:03:38 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 40102 bytes
19/11/12 15:03:38 INFO mapred.LocalJobRunner: 
19/11/12 15:03:38 INFO mapred.Task: Task:attempt_local779279430_0001_r_000000_0 is done. And is in the process of commiting
19/11/12 15:03:38 INFO mapred.LocalJobRunner: 
19/11/12 15:03:38 INFO mapred.Task: Task attempt_local779279430_0001_r_000000_0 is allowed to commit now
19/11/12 15:03:38 INFO output.FileOutputCommitter: Saved output of task 'attempt_local779279430_0001_r_000000_0' to C:/DDD
19/11/12 15:03:38 INFO mapred.LocalJobRunner: reduce > reduce
19/11/12 15:03:38 INFO mapred.Task: Task 'attempt_local779279430_0001_r_000000_0' done.
19/11/12 15:03:38 INFO mapred.JobClient:  map 100% reduce 100%
19/11/12 15:03:38 INFO mapred.JobClient: Job complete: job_local779279430_0001
19/11/12 15:03:38 INFO mapred.JobClient: Counters: 17
19/11/12 15:03:38 INFO mapred.JobClient:   File System Counters
19/11/12 15:03:38 INFO mapred.JobClient:     FILE: Number of bytes read=58988
19/11/12 15:03:38 INFO mapred.JobClient:     FILE: Number of bytes written=419859
19/11/12 15:03:38 INFO mapred.JobClient:     FILE: Number of read operations=0
19/11/12 15:03:38 INFO mapred.JobClient:     FILE: Number of large read operations=0
19/11/12 15:03:38 INFO mapred.JobClient:     FILE: Number of write operations=0
19/11/12 15:03:38 INFO mapred.JobClient:   Map-Reduce Framework
19/11/12 15:03:38 INFO mapred.JobClient:     Map input records=200
19/11/12 15:03:38 INFO mapred.JobClient:     Map output records=3100
19/11/12 15:03:38 INFO mapred.JobClient:     Map output bytes=33900
19/11/12 15:03:38 INFO mapred.JobClient:     Input split bytes=85
19/11/12 15:03:38 INFO mapred.JobClient:     Combine input records=0
19/11/12 15:03:38 INFO mapred.JobClient:     Combine output records=0
19/11/12 15:03:38 INFO mapred.JobClient:     Reduce input groups=51
19/11/12 15:03:38 INFO mapred.JobClient:     Reduce shuffle bytes=0
19/11/12 15:03:38 INFO mapred.JobClient:     Reduce input records=3100
19/11/12 15:03:38 INFO mapred.JobClient:     Reduce output records=51
19/11/12 15:03:38 INFO mapred.JobClient:     Spilled Records=6200
19/11/12 15:03:38 INFO mapred.JobClient:     Total committed heap usage (bytes)=514850816
0

A 0 printed at the end means the job succeeded; a 1 means it failed.
Open your output directory.
Open part-r-00000 to see the final result.

Note

You cannot rerun the same code as-is; it will fail because the output directory already exists, and MapReduce requires that the output directory does not exist before the job runs. Either change the output path or delete the previous output directory first (a sketch for doing this from the driver follows below).
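
One way to avoid the error is to delete the old output directory from the driver itself before submitting the job. A minimal sketch is below; it assumes the C:\BBB output path used above, goes inside run() before waitForCompletion, and needs an extra import of org.apache.hadoop.fs.FileSystem:

```java
// Remove a leftover output directory so the job does not abort with
// "Output directory ... already exists"
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path output = new Path("C:\\BBB");
if (fs.exists(output)) {
    fs.delete(output, true);   // true = recursive delete
}
```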

Python also supports the MapReduce programming model, but Hadoop itself is written in Java, so Java is the officially recommended language for MapReduce jobs on Hadoop. To make life easier for developers using other languages, Hadoop provides a tool called Hadoop Streaming, which lets any program that can read standard input and write standard output serve as the Mapper and Reducer of a MapReduce job. The general steps for writing a MapReduce program in Python and running it on Hadoop are:
1. Prepare the data: gather the input data and upload it to HDFS (the Hadoop Distributed File System).
2. Write the Python scripts:
   - You usually write two scripts, one for the Map task and one for the Reduce task.
   - The Map script reads the input data and turns it into a series of key-value pairs.
   - The Reduce script sorts and aggregates the intermediate data produced by the Map phase and writes the final result.
3. Run the Python scripts with Hadoop Streaming:
   - Submit the MapReduce job from the command line using the Hadoop Streaming command.
   - The command looks roughly like this:
   ```
   hadoop jar /path/to/hadoop-streaming.jar \
   -file /path/to/map.py -mapper /path/to/map.py \
   -file /path/to/reduce.py -reducer /path/to/reduce.py \
   -input /path/to/input -output /path/to/output
   ```
   - Here, `-file` ships the Map and Reduce scripts with the job, `-mapper` and `-reducer` name the scripts to run, and `-input` and `-output` are the HDFS input and output paths.
4. Monitor the job: use Hadoop's web UI or command-line tools to watch the job's status and make sure it completes.
5. Check the results: once the job finishes, inspect the result files under the specified output path.