WordCount: An Introductory MapReduce Program
1. Process Analysis
(1) The MapReduce phases
(2) The job execution phase
When a job runs, the following information must be known (each item maps onto a call on the Job object, as the sketch after this list shows):
(1) The class that executes the Job
(2) The input file path the map phase reads from
(3) The path where the reduce phase saves its output
(4) The Mapper and Reducer classes the job runs
(5) The Hadoop cluster configuration
(6) The data types of the reduce output
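A minimal sketch of that mapping, where inputPath and outputPath stand for org.apache.hadoop.fs.Path values (the full, runnable Driver appears in section 3.3):
Job job = Job.getInstance(new Configuration()); // (5) cluster configuration
job.setJarByClass(WordCountDriver.class);       // (1) class executing the job
job.setMapperClass(WordCountMapper.class);      // (4) the map class...
job.setReducerClass(WordCountReduce.class);     //     ...and the reduce class
job.setOutputKeyClass(Text.class);              // (6) reduce output data types
job.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(job, inputPath);  // (2) map input path
FileOutputFormat.setOutputPath(job, outputPath);// (3) reduce output path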
2. Creating the Maven Project
2.1 The log4j configuration file
Create log4j.properties under src/main/resources with the following contents (adjust the E://hadoop/hdfs_logs/ log-file paths to your own environment):
log4j.rootLogger = debug,stdout,D,E
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = [%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n
log4j.appender.D = org.apache.log4j.DailyRollingFileAppender
log4j.appender.D.File = E://hadoop/hdfs_logs/log.log
log4j.appender.D.Append = true
log4j.appender.D.Threshold = DEBUG
log4j.appender.D.layout = org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
log4j.appender.E = org.apache.log4j.DailyRollingFileAppender
log4j.appender.E.File =E://hadoop/hdfs_logs/error.log
log4j.appender.E.Append = true
log4j.appender.E.Threshold = ERROR
log4j.appender.E.layout = org.apache.log4j.PatternLayout
log4j.appender.E.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
2.2 pom.xml
Add the Hadoop dependencies to pom.xml:
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.4</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.4</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.4</version>
</dependency>
</dependencies>
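A note on the dependency choice: in Hadoop 2.x, hadoop-client is an aggregator artifact that already pulls in hadoop-common and hadoop-hdfs transitively, so listing all three is harmless but slightly redundant.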
3. Program Development
3.1 The Map-phase program
package com.mapreduce.wordCount;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/*
 * Sample contents of the input file:
hello word
hello hadoop
hadoop hdfs
yarn mapreduce
 * */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private Text outKey = new Text();
    // Every occurrence counts as 1, so the value is initialized to 1
    private LongWritable outValue = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // The key is the byte offset of the line in the file; the value is the line itself
        // Take one line of content
        String line = value.toString();
        // Split on spaces
        String[] words = line.split(" ");
        // Iterate over the words in the line
        for (String word : words) {
            /* The map output has the form:
             * hello 1
             * word  1
             * so the output key is the word and the value is its count
             * (no aggregation happens in this phase; every count is 1)
             * */
            // Set the output key
            outKey.set(word);
            // Emit the pair
            context.write(outKey, outValue);
        }
    }
}
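For the sample file above, the map phase therefore emits one (word, 1) pair per occurrence: (hello, 1), (word, 1), (hello, 1), (hadoop, 1), (hadoop, 1), (hdfs, 1), (yarn, 1), (mapreduce, 1).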
3.2 The Reduce-phase program
package com.mapreduce.wordCount;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/*
Input (the map output, after shuffle):
hello 1
hello 1
word 1
hadoop 1
Output:
hello 2
word 1
hadoop 1
* */
public class WordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    private LongWritable reduceOutValue = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        /*
         * Looking at the input and output data:
         * the keys produced by the map phase pass through to the reduce output unchanged,
         * so reduce only needs to sum up the values belonging to each key.
         * */
        long sum = 0; // running count for this word
        for (LongWritable value : values) {
            sum = sum + value.get();
        }
        reduceOutValue.set(sum);
        context.write(key, reduceOutValue);
    }
}
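Between the two phases, the framework's shuffle step sorts and groups the map output by key, so reduce is invoked once per distinct key with all of that key's values: for the input above, ("hello", [1, 1]) arrives in a single call, and the loop sums it to ("hello", 2).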
3.3 The Driver program
package com.mapreduce.wordCount;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Create the job from the cluster configuration
        Configuration config = new Configuration();
        Job job = Job.getInstance(config);
        // Identify the jar to ship by the class it contains
        job.setJarByClass(WordCountDriver.class);
        // Wire up the map and reduce classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);
        // Declare the output data types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Set the input and output paths,
        // passed in as command-line arguments
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
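Two optional calls are worth knowing about; neither is part of the Driver above. Because summing counts is associative and commutative, the Reducer can also safely serve as a combiner; and if the map output types ever differed from the job's final output types, they would have to be declared explicitly:
// Optional: pre-aggregate map output locally to cut shuffle traffic;
// safe here because summing is associative and commutative.
job.setCombinerClass(WordCountReduce.class);
// Only needed when the map output types differ from the final output
// types; in this job they match, so the Driver can omit these calls.
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);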
4. Running the Program
4.1 Running locally
1. Environment preparation
(1) Download the Hadoop distribution and unpack it to a local disk.
(2) Configure the Windows environment variables:
HADOOP_HOME: E:\hadoop\hadoop-2.6.4
Path: %HADOOP_HOME%\bin
(3) hadoop.dll and winutils.exe for Hadoop 2.6.4 on Windows 10
On Windows 10, the executables hadoop.dll and winutils.exe must be placed in the Hadoop bin directory, i.e. under hadoop-2.6.4/bin.
The hadoop.dll and winutils.exe files can be downloaded from Baidu Cloud:
Link: https://pan.baidu.com/s/1Ae6uMSKkXx2Lv4DpUNHbmQ
Extraction code: xjvc
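If the IDE does not pick up the HADOOP_HOME environment variable, a commonly used workaround is to set the corresponding system property at the top of the Driver's main method, before the job is created (the path below is this article's install location; adjust it to yours):
// Point Hadoop's native-binary lookup at the local install directory
System.setProperty("hadoop.home.dir", "E:\\hadoop\\hadoop-2.6.4");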
2. Running the program
(1) Configure the run arguments
Select the Driver class, right-click Run As -> Run Configurations..., and set the input and output paths as program arguments in the dialog that opens.
(2) Run the Driver program. An output folder is created under the configured output path; it holds the files written by the MapReduce job.
Note: the output directory must not exist before the MapReduce job runs; the job creates it automatically and fails if it is already there. One way to make reruns painless is sketched below.
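A minimal sketch of that convenience, reusing the Driver's config and args variables and assuming an extra import of org.apache.hadoop.fs.FileSystem: delete any leftover output directory before calling FileOutputFormat.setOutputPath.
// Delete the output directory if it is left over from a previous run,
// so FileOutputFormat does not reject the job.
Path outputPath = new Path(args[1]);
FileSystem fs = FileSystem.get(config);
if (fs.exists(outputPath)) {
    fs.delete(outputPath, true); // true = delete recursively
}
FileOutputFormat.setOutputPath(job, outputPath);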
4.2 Running on the cluster
(1) Package the project into a jar file with Maven.
On Windows, run cmd as administrator and change into the project's directory.
Run the Maven package command: mvn package
When packaging finishes, the freshly built jar is placed in the project's target directory.
For the jar to be runnable, pom.xml must configure the shade plugin and name the main class:
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<!-- specify the main class -->
<mainClass>com.mapreduce.wordCount.WordCountDriver</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
(2) Create a mapreduce_jar folder under the Hadoop root directory to hold the MapReduce program jars.
(3) Upload the program jar to the mapreduce_jar folder with an FTP tool.
(4) Run the jar with the command below.
Change into /usr/local/hadoop/bin and execute:
./hadoop jar /usr/local/hadoop/mapreduce_jar/mapreduce-0.0.1-SNAPSHOT.jar /input/wordcount /output
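Assuming the default single reducer and TextOutputFormat, the word counts end up in a part-r-00000 file under the output directory and can be inspected with:
./hadoop fs -cat /output/part-r-00000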