Hadoop Assignment
Task: word frequency count (WordCount)
I. Preparation
1. Required software: a Java IDE (Eclipse or IDEA).
2. Prerequisite: Hadoop is installed on the virtual machine, with the MapReduce and HDFS components set up and working.
3. Create a text file hbb.txt to serve as the input; example contents are shown below.
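For illustration (any plain-text contents will do; these lines are only a hypothetical sample), hbb.txt might contain:
hello hadoop
hello world
hadoop hadoop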
II. Building the jar
1. Open Eclipse and write the following Java code:
package hdfs;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class word {

    // Mapper: split each input line into tokens and emit (word, 1).
    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer token = new StringTokenizer(value.toString());
            while (token.hasMoreTokens()) {
                context.write(new Text(token.nextToken()), ONE);
            }
        }
    }

    // Reducer (also used as the combiner): sum the counts for each word.
    public static class IntSumReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    // Comparator that inverts IntWritable's natural order, so the sort job
    // emits words in descending order of frequency.
    private static class IntWritableDecreasingComparator extends
            IntWritable.Comparator {
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Intermediate directory holding the raw (word, count) pairs.
        Path tempDir = new Path("hdfs://hadoop0:9000/output2/word1");
        try {
            // Job 1: count word frequencies.
            Job job = Job.getInstance(conf, "word count");
            job.setJarByClass(word.class);
            job.setMapperClass(TokenizerMapper.class);
            job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setNumReduceTasks(2);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Write the intermediate result as a SequenceFile so the sort
            // job can read it back directly as (Text, IntWritable) pairs.
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            FileInputFormat.addInputPath(job, new Path("hdfs://hadoop0:9000/input2"));
            FileOutputFormat.setOutputPath(job, tempDir);

            if (job.waitForCompletion(true)) {
                // Job 2: swap (word, count) to (count, word) with InverseMapper,
                // then let the shuffle sort by count, descending.
                Job sortJob = Job.getInstance(conf, "sort");
                sortJob.setJarByClass(word.class);
                FileInputFormat.addInputPath(sortJob, tempDir);
                sortJob.setInputFormatClass(SequenceFileInputFormat.class);
                sortJob.setMapperClass(InverseMapper.class);
                // A single reducer yields one globally sorted output file.
                sortJob.setNumReduceTasks(1);
                FileOutputFormat.setOutputPath(sortJob,
                        new Path("hdfs://hadoop0:9000/output2/word2"));
                sortJob.setOutputKeyClass(IntWritable.class);
                sortJob.setOutputValueClass(Text.class);
                sortJob.setOutputFormatClass(TextOutputFormat.class);
                sortJob.setSortComparatorClass(IntWritableDecreasingComparator.class);
                if (sortJob.waitForCompletion(true)) {
                    System.out.println("ok");
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            // Clean up the intermediate directory when the JVM exits.
            FileSystem fs = tempDir.getFileSystem(conf);
            fs.deleteOnExit(tempDir);
        }
    }
}
Build the code in Eclipse, then export it as a jar (word.jar). Both compiling and exporting require the Hadoop library jars (e.g. hadoop-common and hadoop-mapreduce-client-core) on the project's build path.
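The program chains two jobs: the first counts words and writes (word, count) pairs to a temporary SequenceFile directory; the second uses InverseMapper to swap each pair to (count, word) so the shuffle sorts by count, with the custom comparator making the order descending. Before packaging, you can sanity-check this counting-and-sorting logic with a small local Java program (a sketch using only the JDK, no Hadoop; the sample text mirrors the hypothetical hbb.txt above):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

public class LocalWordCountCheck {
    public static void main(String[] args) {
        // Hypothetical input standing in for hbb.txt.
        String text = "hello hadoop hello world hadoop hadoop";
        // Count tokens, mirroring TokenizerMapper + IntSumReducer.
        Map<String, Integer> counts = new HashMap<>();
        StringTokenizer token = new StringTokenizer(text);
        while (token.hasMoreTokens()) {
            counts.merge(token.nextToken(), 1, Integer::sum);
        }
        // Sort by count descending, mirroring the second (sort) job.
        List<Map.Entry<String, Integer>> sorted = new ArrayList<>(counts.entrySet());
        sorted.sort((a, b) -> b.getValue() - a.getValue());
        for (Map.Entry<String, Integer> e : sorted) {
            // Same "count<TAB>word" layout that TextOutputFormat produces.
            System.out.println(e.getValue() + "\t" + e.getKey());
        }
        // Expected output: 3 hadoop, 2 hello, 1 world.
    }
}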
III. Running on Hadoop
Start the virtual machine and connect to it with Xshell (Xftp is used for file transfer).
1. Start Hadoop:
start-dfs.sh
start-yarn.sh
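You can confirm the daemons came up with jps, which should list NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager:
jps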
2. Create the input directory. The input path is hardcoded in the code as hdfs://hadoop0:9000/input2, so the directory name must match:
hdfs dfs -mkdir /input2
3. Upload hbb.txt to the virtual machine with Xftp, then copy it into the HDFS input directory:
hadoop fs -put /usr/hadoop/hbb.txt /input2
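You can verify the upload with:
hadoop fs -ls /input2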
4. Run the jar. The input and output paths are hardcoded in the program, so no path arguments are needed; if the exported manifest does not set a main class, name it explicitly (package hdfs, class word):
hadoop jar word.jar hdfs.word
IV. Checking the results
The second (sort) job writes the final output to /output2/word2; with a single reducer it is one file:
hadoop fs -cat /output2/word2/part-r-00000
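Each output line is a count, a tab, and the word, in descending order of frequency. For the hypothetical hbb.txt above, the output would be:
3	hadoop
2	hello
1	world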