Maven
Add the dependency to pom.xml:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.0</version>
</dependency>
Word Count
Count the words in a single file, file.txt.
file.txt
(I just grabbed the Zen of Python as sample text.)
The Zen of Python, by Tim Peters
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
Create the file and upload it to the HDFS root directory with the hadoop fs -put command (any directory works; I used the root).
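For example, assuming file.txt is in the current local directory:
hadoop fs -put file.txt /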
Write the program and run it:
package com.sleepyyoung;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.StringTokenizer;

public class WordCount {

    public static class Map extends Mapper<Object, Text, Text, IntWritable> {
        private final IntWritable one = new IntWritable(1);

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Split the line on whitespace and emit <word, 1> for each token.
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                context.write(new Text(itr.nextToken()), one);
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the counts emitted for this word.
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    /**
     * Count the words in a file.
     *
     * @param hdfsLink  HDFS address, ip:port
     * @param inputFile input file
     * @param outputDir output directory for the result
     * @throws IOException .
     * @throws ClassNotFoundException .
     * @throws InterruptedException .
     */
    public void getWordCount(String hdfsLink, String inputFile, String outputDir) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", hdfsLink); // the current key; "fs.default.name" is deprecated
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputFile));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        new WordCount().getWordCount("hdfs://192.168.213.128:9000", "/file.txt", "/output/");
    }
}
The program's output is as follows (it's too long for one screenshot…):
[hadoop@centos01 ~]$ hadoop fs -cat /output/*
*right* 1
-- 1
--obvious 1
Although 3
Beautiful 1
Complex 1
Dutch. 1
Errors 1
Explicit 1
Flat 1
If 2
In 1
Namespaces 1
Now 1
Peters 1
Python, 1
Readability 1
Simple 1
Sparse 1
Special 1
The 1
There 1
Tim 1
Unless 1
Zen 1
a 2
ambiguity, 1
and 1
are 1
aren't 1
at 1
bad 1
be 3
beats 1
better 8
break 1
by 1
cases 1
complex. 1
complicated. 1
counts. 1
dense. 1
do 2
easy 1
enough 1
explain, 2
explicitly 1
face 1
first 1
good 1
great 1
guess. 1
hard 1
honking 1
idea 1
idea. 2
implementation 2
implicit. 1
is 10
it 1
it's 1
it. 1
let's 1
may 2
more 1
nested. 1
never 2
never. 1
not 1
now. 1
obvious 1
of 3
often 1
one 2
one-- 1
only 1
pass 1
practicality 1
preferably 1
purity. 1
refuse 1
rules. 1
should 2
silenced. 1
silently. 1
special 1
temptation 1
than 8
that 1
the 5
those! 1
to 5
ugly. 1
unless 1
way 2
you're 1
Secondary Sort
Sort the file file.txt in ascending order by the first field; where first fields are equal, sort in descending order by the second field.
file.txt
A 3
B 5
C 1
B 6
A 4
C 5
Write the program and run it:
package com.sleepyyoung;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Secondary sort.
 */
public class SecondOrder {

    public static class Map extends Mapper<LongWritable, Text, MyKeyPair, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Pack both fields into a composite key so the shuffle sorts on both.
            String line = value.toString();
            StringTokenizer itr = new StringTokenizer(line);
            String first = itr.nextToken();
            String second = itr.nextToken();
            MyKeyPair outKey = new MyKeyPair();
            outKey.setFirst(first);
            outKey.setSecond(Integer.parseInt(second));
            IntWritable outValue = new IntWritable();
            outValue.set(Integer.parseInt(second));
            context.write(outKey, outValue);
        }
    }

    public static class Reduce extends Reducer<MyKeyPair, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(MyKeyPair key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // One call per group (same first field); the values arrive already
            // sorted in descending order of the second field.
            Text outKey = new Text();
            for (IntWritable value : values) {
                outKey.set(key.getFirst());
                context.write(outKey, value);
            }
        }
    }

    /**
     * Secondary sort.
     *
     * @param hdfsLink  HDFS address, ip:port
     * @param inputFile input file
     * @param outputDir output directory
     * @throws IOException .
     * @throws ClassNotFoundException .
     * @throws InterruptedException .
     */
    public void doOrder(String hdfsLink, String inputFile, String outputDir) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", hdfsLink);
        Job job = Job.getInstance(conf, "Second Order");
        job.setJarByClass(SecondOrder.class);
        job.setMapperClass(Map.class);
        job.setPartitionerClass(MyPartitioner.class);
        job.setGroupingComparatorClass(MyGroupComparator.class);
        job.setReducerClass(Reduce.class);
        job.setMapOutputKeyClass(MyKeyPair.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(inputFile));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        new SecondOrder().doOrder("hdfs://192.168.213.128:9000", "/file.txt", "/output/");
    }
}

/**
 * Custom composite key class.
 */
class MyKeyPair implements WritableComparable<MyKeyPair> {
    private String first;
    private int second;

    public String getFirst() {
        return first;
    }

    public void setFirst(String first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public int compareTo(MyKeyPair o) {
        // Ascending by the first field; ties broken by the second field, descending.
        int res = this.first.compareTo(o.getFirst());
        if (res != 0) {
            return res;
        } else {
            return -Integer.compare(this.second, o.getSecond());
        }
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        // Serialize the fields.
        dataOutput.writeUTF(first);
        dataOutput.writeInt(second);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // Deserialize the fields.
        this.first = dataInput.readUTF();
        this.second = dataInput.readInt();
    }
}

/**
 * Custom partitioner class.
 */
class MyPartitioner extends Partitioner<MyKeyPair, IntWritable> {
    /**
     * Implement getPartition() so that partitioning uses only the first field;
     * all records sharing a first field then reach the same reducer.
     *
     * @param myKeyPair   key
     * @param intWritable value
     * @param i           number of partitions (equals the number of reduce tasks)
     * @return partition number
     */
    @Override
    public int getPartition(MyKeyPair myKeyPair, IntWritable intWritable, int i) {
        return (myKeyPair.getFirst().hashCode() & Integer.MAX_VALUE) % i;
    }
}

/**
 * Custom grouping comparator class.
 */
class MyGroupComparator extends WritableComparator {
    protected MyGroupComparator() {
        // Register MyKeyPair as the key type; true = create instances for deserialization.
        super(MyKeyPair.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Group on the first field only, ignoring the second, so one reduce
        // call sees every value for a given first field.
        return ((MyKeyPair) a).getFirst().compareTo(((MyKeyPair) b).getFirst());
    }
}
After the program runs, view the output:
hadoop fs -cat /output/*
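With the sample input, sorting ascending on the first field and descending on the second should produce the following (derived by hand from the input, not captured from a real run):
A 4
A 3
B 6
B 5
C 5
C 1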
Merge and Deduplicate
Merge file1.txt and file2.txt under the input folder, removing duplicate lines.
file1.txt
2019-3-1 a
2019-3-2 b
2019-3-3 c
2019-3-4 d
2019-3-5 a
2019-3-6 b
2019-3-7 c
2019-3-3 c
file2.txt
2019-3-1 b
2019-3-2 a
2019-3-3 b
2019-3-4 d
2019-3-5 a
2019-3-6 c
2019-3-7 d
2019-3-3 c
Create these two files and upload them to the /input folder on HDFS with the hadoop fs -put command.
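For example, assuming both files are in the current local directory:
hadoop fs -mkdir -p /input
hadoop fs -put file1.txt file2.txt /input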
package com.sleepyyoung;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Merge the files in a folder, removing duplicate lines.
 */
public class DeDuplication {

    // map() copies the input value to the output key and emits it directly;
    // duplicate lines collapse into a single key during the shuffle.
    public static class Map extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            context.write(value, new Text(""));
        }
    }

    // reduce() copies the input key to the output key and emits it directly.
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String input = args[0];
        String output = args[1];
        Configuration conf = new Configuration();
        // Build the job.
        Job job = Job.getInstance(conf, "Data Deduplication");
        job.setJarByClass(DeDuplication.class);
        // Set the Map, Combine, and Reduce classes.
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        // Set the output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Set the input and output directories.
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Package the project as a jar, rename it deDuplication.jar, and upload it to wherever you keep your Hadoop job jars.
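A minimal packaging sketch, assuming a standard Maven layout (the jar name under target/ depends on your artifactId and version, so the placeholder below must be adjusted):
mvn clean package
mv target/<artifactId>-<version>.jar deDuplication.jar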
Run the program with the following command (com.sleepyyoung is the package name; the first argument, /input, is the folder containing the files to merge, and the second, /output, is the folder where the merged result is written):
hadoop jar deDuplication.jar com.sleepyyoung.DeDuplication /input /output
View the output:
hadoop fs -cat /output/*
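Given the sample files, the merged and deduplicated result should be (derived by hand from the inputs, not captured from a real run):
2019-3-1 a
2019-3-1 b
2019-3-2 a
2019-3-2 b
2019-3-3 b
2019-3-3 c
2019-3-4 d
2019-3-5 a
2019-3-6 b
2019-3-6 c
2019-3-7 c
2019-3-7 d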
Average Score
For math.txt, chinese.txt, and english.txt under the input folder, compute each student's average score.
math.txt
张三 88
李四 99
王五 66
赵六 77
chinese.txt
张三 78
李四 89
王五 96
赵六 86
english.txt
张三 80
李四 90
王五 82
赵六 76
Create these three files and upload them to the /input folder on HDFS with the hadoop fs -put command.
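For example, assuming the three files are in the current local directory:
hadoop fs -mkdir -p /input
hadoop fs -put math.txt chinese.txt english.txt /input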
package com.sleepyyoung;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.StringTokenizer; // not org.apache.commons.text.StringTokenizer

/**
 * Compute each student's average score across all files in a folder.
 */
public class GetAverageScore {

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Decode the line explicitly as UTF-8 so the Chinese names survive intact.
            String line = new String(value.getBytes(), 0, value.getLength(), StandardCharsets.UTF_8);
            StringTokenizer itr = new StringTokenizer(line);
            String strName = itr.nextToken();  // student name
            String strScore = itr.nextToken(); // student score
            Text name = new Text(strName);
            int scoreInt = Integer.parseInt(strScore);
            context.write(name, new IntWritable(scoreInt));
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            int count = 0;
            for (IntWritable value : values) {
                sum += value.get(); // total score
                count++;            // number of subjects
            }
            int average = sum / count; // integer division truncates
            // Emit the name and the average score.
            context.write(key, new IntWritable(average));
        }
    }

    /**
     * Compute each student's average score across all files in a folder.
     *
     * @param hdfsLink  HDFS address, ip:port
     * @param inputDir  input folder containing the score files
     * @param outputDir output folder for the result
     * @throws IOException .
     * @throws ClassNotFoundException .
     * @throws InterruptedException .
     */
    public void getAverage(String hdfsLink, String inputDir, String outputDir) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", hdfsLink);
        Job job = Job.getInstance(conf, "Score Average");
        job.setJarByClass(GetAverageScore.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(inputDir));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        new GetAverageScore().getAverage("hdfs://192.168.213.128:9000", "/input/", "/output/");
    }
}
After the program runs, view the output:
hadoop fs -cat /output/*
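Because the reducer uses integer division, the averages truncate: 张三 (88+78+80)/3 = 82, 李四 (99+89+90)/3 truncates to 92, 王五 (66+96+82)/3 truncates to 81, and 赵六 (77+86+76)/3 truncates to 79. So the output should be (derived by hand, not captured from a real run):
张三 82
李四 92
王五 81
赵六 79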