3. Add a resources directory under src/main
In IDEA, right-click the main directory, choose New > Directory, and pick resources (IDEA suggests it automatically for Maven projects).
4. Create a log4j.properties file in the resources directory and paste the following configuration into it:
log4j.rootLogger=INFO, stdout, D
# Console Appender
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{HH:mm:ss,SSS} [%t] %-5p %c %x - %m%n
# Custom tweaks
log4j.logger.com.codahale.metrics=WARN
log4j.logger.com.ryantenney=WARN
log4j.logger.com.zaxxer=WARN
log4j.logger.org.apache=WARN
log4j.logger.org.hibernate=WARN
log4j.logger.org.hibernate.engine.internal=WARN
log4j.logger.org.hibernate.validator=WARN
log4j.logger.org.springframework=WARN
log4j.logger.org.springframework.web=WARN
log4j.logger.org.springframework.security=WARN
# log file
log4j.appender.D=org.apache.log4j.DailyRollingFileAppender
log4j.appender.D.File=D:/log.log
log4j.appender.D.Append=true
log4j.appender.D.Threshold=DEBUG
log4j.appender.D.layout=org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss} [ %t:%r ] - [ %p ] %m%n
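To confirm the configuration is picked up before wiring in Hadoop, a quick smoke test can help. This is a minimal sketch; the class name is illustrative, and it assumes log4j.properties ends up on the classpath, which the resources directory guarantees:

import org.apache.log4j.Logger;

public class Log4jSmokeTest {
    private static final Logger LOG = Logger.getLogger(Log4jSmokeTest.class);

    public static void main(String[] args) {
        // INFO and above reach both appenders: the console and D:/log.log
        LOG.info("log4j.properties was loaded from the resources directory");
        // DEBUG is dropped by the root logger level (INFO), even though the
        // file appender's own threshold is DEBUG
        LOG.debug("this message is filtered out");
    }
}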
5. Add the Maven dependencies. Open the child module's pom.xml and add the following entries inside its <dependencies> element (a placement sketch follows the dependency list):
<!-- Unit-testing dependency -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.11</version>
</dependency>
<!-- Hadoop client -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.3</version>
</dependency>
<!-- HDFS -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.7.3</version>
</dependency>
<!-- Common Hadoop components -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.3</version>
</dependency>
<!-- Logging dependencies -->
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.6.1</version>
</dependency>
<dependency>
    <groupId>log4j</groupId>
    <artifactId>log4j</artifactId>
    <version>1.2.17</version>
</dependency>
6. Write the code
Create packages under src/main/java to hold the Java classes; the structure follows the package declarations in the code below (mapper, reducer, and master sub-packages under com.dtinone.hadooptest1).
WordCountMapper.java
package com.dtinone.hadooptest1.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get the current line of input
        String line = value.toString();
        // Split the line on spaces
        String[] words = line.split(" ");
        // Emit (word, 1) for each word
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
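The junit dependency from step 5 can double as a quick local check of the mapper's tokenization before anything touches the cluster. A minimal sketch; the class name is illustrative, placed under src/test/java:

import static org.junit.Assert.assertArrayEquals;

import org.junit.Test;

public class WordSplitTest {
    @Test
    public void splitsLineOnSingleSpaces() {
        // Mirrors the mapper's split(" "): fine for single spaces, but tabs or
        // repeated spaces would need split("\\s+") instead
        assertArrayEquals(new String[]{"hello", "world"}, "hello world".split(" "));
    }
}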
WordCountReducer.java
package com.dtinone.hadooptest1.reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        // Sum the counts within this key's group
        for (IntWritable value : values) {
            count += value.get();
        }
        // Emit the final (word, total) pair
        context.write(key, new IntWritable(count));
    }
}
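Because this reduce function is associative and commutative, it can optionally double as a combiner to shrink shuffle traffic. This is an extra tweak, not part of the original walkthrough, and it amounts to one more line in the driver shown next:

// optional: pre-aggregate map output locally before the shuffle
job.setCombinerClass(WordCountReducer.class);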
WordCountMaster.java
package com.dtinone.hadooptest1.master;

import com.dtinone.hadooptest1.mapper.WordCountMapper;
import com.dtinone.hadooptest1.reducer.WordCountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountMaster {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Point hadoop.home.dir at the local Hadoop installation (needed on Windows)
        System.setProperty("hadoop.home.dir", "E:\\developApps\\hadoop-2.7.3");
        // Use the IP of your own cluster's namenode host here
        String hdfsUrl = "hdfs://192.168.180.88:9000/";
        Configuration conf = new Configuration();
        // Initialize the job and give it a name
        Job job = Job.getInstance(conf, "WordCount");
        // Set the class whose jar should be shipped to the cluster
        job.setJarByClass(WordCountMaster.class);
        // Set the Mapper class
        job.setMapperClass(WordCountMapper.class);
        // Set the Reducer class
        job.setReducerClass(WordCountReducer.class);
        // Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Set the reducer output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Set the input path
        FileInputFormat.setInputPaths(job, new Path(hdfsUrl + "input/wordCount"));
        // Set the output path (it must not exist yet, or the job fails on submit)
        FileOutputFormat.setOutputPath(job, new Path(hdfsUrl + "output/wordCount/1"));
        // Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        // Follow-up once the job succeeds
        if (result) {
            System.out.println("Word count job completed!");
        }
    }
}
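One caveat with the driver as written: MapReduce refuses to start if the output directory already exists, so a second run against output/wordCount/1 fails. A hedged workaround, inserted just before job.waitForCompletion and reusing the driver's conf and hdfsUrl (it additionally requires imports org.apache.hadoop.fs.FileSystem and java.net.URI), is to delete stale output first:

// optional: remove a stale output directory before submitting, so reruns succeed
FileSystem fs = FileSystem.get(URI.create(hdfsUrl), conf);
Path outputPath = new Path(hdfsUrl + "output/wordCount/1");
if (fs.exists(outputPath)) {
    fs.delete(outputPath, true); // true = delete recursively
}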
Note: before running the program, create the input/wordCount directory in HDFS and put the file to be counted in it.
Do this on the namenode node of the cluster:
# First cd into Hadoop's sbin directory
cd /opt/apps/hadoop-2.7.3/sbin
# Start the cluster
./start-dfs.sh
# Create the /input/wordCount directory (absolute path, matching the driver's input URI)
hadoop fs -mkdir -p /input/wordCount
# Upload the file into that directory; here it lives in the data directory under the Hadoop install root
cd ../data
hadoop fs -put word.txt /input/wordCount
7. Run the program (two common ways to launch it are sketched below).
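Either run WordCountMaster's main method straight from IDEA, or package the module and submit it on a cluster node. The jar name below is whatever your build actually produces under target/, so treat it as a placeholder:

# Build the jar from the child module's directory
mvn clean package
# Submit on a cluster node; replace the jar name with your actual artifact
hadoop jar target/hadooptest1-1.0-SNAPSHOT.jar com.dtinone.hadooptest1.master.WordCountMaster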
8. Verify the results.
Open the HDFS web UI and check that the job's output directory exists.
Then view the actual counts on the namenode node of the cluster:
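Assuming the default single reducer, the counts land in one part file under the output path set in the driver:

# List the job output; an empty _SUCCESS file marks a completed job
hadoop fs -ls /output/wordCount/1
# Print the per-word counts
hadoop fs -cat /output/wordCount/1/part-r-00000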