Hadoop Programming
Example
/opt/sxt/hadoop-2.6.5/share/hadoop/mapreduce/
Jar file
- hadoop-mapreduce-examples-2.6.5.jar
Preparation
- for i in `seq 100000`; do echo "hello sxt $i" >> test.txt; done
- hdfs dfs -mkdir -p /user/root
- hdfs dfs -ls -R /
- hdfs dfs -D dfs.blocksize=1048576 -put ./test.txt /user/root
- dfs.blocksize=1048576 stores the file in 1 MB blocks, so the upload is split across several blocks and the job later gets more than one map task
Command
- hadoop jar hadoop-mapreduce-examples-2.6.5.jar wordcount /input /output
- wordcount is the example program to run
- input: the directory in HDFS that holds the input data
- output: a directory that must not yet exist in HDFS; the MR job writes its result there (nothing may be placed in the output path beforehand)
Explanation
- Contents of the output directory:
- -rw-r--r-- 3 root supergroup 0 2017-07-02 02:49 /mr/test/output/_SUCCESS
- _SUCCESS: a marker file signaling that the job finished successfully
- -rw-r--r-- 3 root supergroup 49 2017-07-02 02:49 /mr/test/output/part-r-00000
- part-r-00000: a data file written by a reducer; "r" stands for reduce and 00000 is the reducer's index
- With several reducers there are several such data files
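The result files can also be read back through the HDFS Java API; a minimal sketch (assuming the example output path above and a cluster config on the classpath):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.BufferedReader;
import java.io.InputStreamReader;
public class ReadOutput {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
//open one reduce output file and print it line by line
try (FSDataInputStream in = fs.open(new Path("/mr/test/output/part-r-00000"));
BufferedReader br = new BufferedReader(new InputStreamReader(in))) {
String line;
while ((line = br.readLine()) != null) System.out.println(line);
}
}
}
The same content can be viewed with hdfs dfs -cat /mr/test/output/part-r-00000.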
WordCount Example
Startup
- zkServer.sh start
- start-dfs.sh
- yarn-daemon.sh start resourcemanager
- start-yarn.sh
WordCount
- MyWC
- package com.sxt.mr.wc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyWC {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
//Create a new job
Job job = Job.getInstance(conf);
//entry class: lets Hadoop locate the jar containing this job when it is packaged
job.setJarByClass(MyWC.class);
//Specify various job-specific parameters
//set the job name
job.setJobName("myjob");
//input path
Path inPath = new Path("/user/root/test.txt");
FileInputFormat.addInputPath(job, inPath);
//output path (must not already exist)
Path outPath = new Path("/output/wordcount");
//delete the output path if it already exists
if (outPath.getFileSystem(conf).exists(outPath)) {
outPath.getFileSystem(conf).delete(outPath, true);
}
FileOutputFormat.setOutputPath(job, outPath);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(MyReducer.class);
//Submit the job, then poll for progress until the job is complete
job.waitForCompletion(true);
}
}
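Since word-count sums are associative, the reducer can optionally double as a combiner to pre-aggregate on the map side and shrink shuffle traffic; a one-line sketch for the driver above:
//optional: run MyReducer locally on each mapper's output before the shuffle
job.setCombinerClass(MyReducer.class);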
- MyMapper
- package com.sxt.mr.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.StringTokenizer;
public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString()); //split the line into whitespace-separated tokens, e.g. "hello", "sxt", "1"
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
- MyReducer
- package com.sxt.mr.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
//sum the counts for one key
private IntWritable result = new IntWritable();
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
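To make the shuffle between map and reduce concrete, the following standalone sketch (illustrative only, with hypothetical data) mimics what the framework does: group the mapper's (word, 1) pairs by key, then hand each key and its value list to one reduce call:
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
public class ShuffleSketch {
public static void main(String[] args) {
//simulated map output: each word stands for one (word, 1) pair
String[] words = {"hello", "sxt", "hello"};
Map<String, List<Integer>> grouped = new TreeMap<>();
for (String w : words) {
grouped.computeIfAbsent(w, k -> new ArrayList<>()).add(1);
}
//each entry corresponds to one reduce(key, values) call
for (Map.Entry<String, List<Integer>> e : grouped.entrySet()) {
int sum = 0;
for (int v : e.getValue()) sum += v;
System.out.println(e.getKey() + "\t" + sum); //hello 2, sxt 1
}
}
}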
Source Code Analysis
MapReduce Examples
Case 1
- MyTQ
- package com.bjsxt.tq;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
 * Input record format: "1949-10-01 14:21:02\t34c" (date and time, a tab, then the temperature)
 */
public class MyTQ {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//1. configuration
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(MyTQ.class);
job.setJobName("tq");
//2. set input and output paths
Path inPath = new Path("/tq/input");
FileInputFormat.addInputPath(job, inPath);
Path outPath = new Path("/tq/output");
if (outPath.getFileSystem(conf).exists(outPath)) {
outPath.getFileSystem(conf).delete(outPath, true);
}
FileOutputFormat.setOutputPath(job, outPath);
//3. set the mapper; the custom type Tq is the key shuffled between map and reduce
job.setMapperClass(Tmapper.class);
job.setMapOutputKeyClass(Tq.class);
job.setMapOutputValueClass(IntWritable.class);
//4. custom sort comparator
job.setSortComparatorClass(TSortComparator.class);
//5. custom partitioner
job.setPartitionerClass(TPartitoner.class);
//6. custom grouping comparator
job.setGroupingComparatorClass(TGroupComparator.class);
//7. number of reduce tasks
job.setNumReduceTasks(2);
//8. set the reducer
job.setReducerClass(Treducer.class);
//9. submit and report progress until the job completes
job.waitForCompletion(true);
}
}
- TGroupComparator
- package com.bjsxt.tq;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
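//groups map output so that all records of the same year and month reach one reduce() call together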
public class TGroupComparator extends WritableComparator {
Tq t1 = null;
Tq t2 = null;
public TGroupComparator() {
super(Tq.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
t1 = (Tq) a;
t2 = (Tq) b;
int c1 = Integer.compare(t1.getYear(), t2.getYear());
if (c1 == 0) {
return Integer.compare(t1.getMonth(), t2.getMonth());
}
return c1;
}
}
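For intuition: under this comparator two records from different days of the same month compare as equal, so they fall into the same reduce group. A quick hypothetical check:
Tq a = new Tq(); a.setYear(1949); a.setMonth(10); a.setDay(1);
Tq b = new Tq(); b.setYear(1949); b.setMonth(10); b.setDay(2);
System.out.println(new TGroupComparator().compare(a, b)); //prints 0: same group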
- Tmapper
- package com.bjsxt.tq;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
public class Tmapper extends Mapper<LongWritable, Text, Tq, IntWritable> {
Tq tkey = new Tq();
IntWritable tval = new IntWritable();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //key must be LongWritable to actually override Mapper.map; with Object the identity mapper would run and the job would fail
//获得时间、温度数组
String[] words = StringUtils.split(value.toString(), '\t');
String pattern = "yyyy-MM-dd";
SimpleDateFormat sdf = new SimpleDateFormat(pattern);
try {
Date date = sdf.parse(words[0]);
Calendar cal = Calendar.getInstance();
cal.setTime(date);
tkey.setYear(cal.get(Calendar.YEAR));
tkey.setMonth(cal.get(Calendar.MONTH) + 1);
tkey.setDay(cal.get(Calendar.DAY_OF_MONTH));
int wd = Integer.parseInt(words[1].substring(0, words[1].lastIndexOf("c")));
tkey.setWd(wd);
tval.set(wd);
context.write(tkey, tval);
} catch (ParseException e) {
e.printStackTrace();
}
}
}
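SimpleDateFormat is acceptable here because each map task runs single-threaded, but as a sketch (not part of the original code) the immutable, thread-safe java.time API can parse the same field:
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
public class DateParseSketch {
public static void main(String[] args) {
String field = "1949-10-01 14:21:02"; //words[0] in the mapper
LocalDate d = LocalDate.parse(field.substring(0, 10), DateTimeFormatter.ISO_LOCAL_DATE);
System.out.println(d.getYear() + " " + d.getMonthValue() + " " + d.getDayOfMonth()); //1949 10 1
}
}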
- TPartitoner
- package com.bjsxt.tq;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class TPartitoner extends Partitioner<Tq, IntWritable> {
@Override
public int getPartition(Tq key, IntWritable value, int numPartitions) {
//route by year; the return value must lie in [0, numPartitions)
return key.getYear() % numPartitions;
}
}
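The last argument is the number of reduce tasks set in the driver (2 here), so years are spread across the reducers by parity; a hypothetical check:
Tq k = new Tq(); k.setYear(1949);
System.out.println(new TPartitoner().getPartition(k, new IntWritable(0), 2)); //1949 % 2 = 1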
- Tq
- package com.bjsxt.tq;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
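//composite key: year, month, day plus temperature (wd); compareTo below defines the default sort order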
public class Tq implements WritableComparable<Tq> {
private int year;
private int month;
private int day;
private int wd;
public int getYear() {
return year;
}
public void setYear(int year) {
this.year = year;
}
public int getMonth() {
return month;
}
public void setMonth(int month) {
this.month = month;
}
public int getDay() {
return day;
}
public void setDay(int day) {
this.day = day;
}
public int getWd() {
return wd;
}
public void setWd(int wd) {
this.wd = wd;
}
@Override
public String toString() {
return year + "-" + month + "-" + day;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(this.getYear());
dataOutput.writeInt(this.getMonth());
dataOutput.writeInt(this.getDay());
dataOutput.writeInt(this.getWd());
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.setYear(dataInput.readInt());
this.setMonth(dataInput.readInt());
this.setDay(dataInput.readInt());
this.setWd(dataInput.readInt());
}
@Override
public int compareTo(Tq o) {
int c1 = Integer.compare(this.getYear(), o.getYear());
if (c1 == 0) {
int c2 = Integer.compare(this.getMonth(), o.getMonth());
if (c2 == 0) {
return Integer.compare(this.getDay(), o.getDay());
}
return c2;
}
return c1;
}
}
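write and readFields must serialize the fields in exactly the same order; a quick round-trip sketch (illustrative, plain java.io streams):
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class TqRoundTrip {
public static void main(String[] args) throws IOException {
Tq in = new Tq();
in.setYear(1949); in.setMonth(10); in.setDay(1); in.setWd(34);
//serialize to bytes, as the shuffle would
ByteArrayOutputStream bos = new ByteArrayOutputStream();
in.write(new DataOutputStream(bos));
//deserialize into a fresh object
Tq out = new Tq();
out.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
System.out.println(out + " wd=" + out.getWd()); //1949-10-1 wd=34
}
}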
- Treducer
- package com.bjsxt.tq;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
 * Sample output (date and temperature):
 * 1949-10-01 34
 * 1949-10-02 34
 * 1949-10-03 34
 * 1949-10-05 34
 */
public class Treducer extends Reducer<Tq, IntWritable, Text, IntWritable> {
Text tkey = new Text();
IntWritable tval = new IntWritable();
@Override
protected void reduce(Tq key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
//Values arrive sorted by temperature descending within one year-month group,
//and the key instance is updated as the iteration advances.
//Emit the hottest record, then the hottest record that falls on a different day.
//flag and day must be declared outside the loop, otherwise they reset on every value.
int flag = 0;
int day = 0;
for (IntWritable val : values) {
if (flag == 0) {
//first value: the month's highest temperature
tkey.set(key.toString());
tval.set(val.get());
context.write(tkey, tval);
flag++;
day = key.getDay();
}
if (flag != 0 && day != key.getDay()) {
//first value from a different day: the second-highest day's temperature
tkey.set(key.toString());
tval.set(val.get());
context.write(tkey, tval);
return;
}
}
}
}
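The same selection logic in isolation, as a standalone sketch with hypothetical data, showing why the descending temperature sort makes the first record the monthly maximum:
public class Top2Sketch {
public static void main(String[] args) {
//(day, temperature) pairs of one month, pre-sorted by temperature descending
int[][] sorted = {{1, 34}, {2, 34}, {1, 31}, {3, 30}};
int firstDay = sorted[0][0];
System.out.println("day " + sorted[0][0] + " -> " + sorted[0][1]); //hottest record
for (int[] rec : sorted) {
if (rec[0] != firstDay) { //first record from a different day
System.out.println("day " + rec[0] + " -> " + rec[1]);
break;
}
}
}
}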
- TSortComparator
- package com.bjsxt.tq;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/*
 * Sort years and months ascending, temperatures descending
 */
public class TSortComparator extends WritableComparator {
Tq t1 = null;
Tq t2 = null;
public TSortComparator() {
super(Tq.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
t1 = (Tq) a;
t2 = (Tq) b;
int c1 = Integer.compare(t1.getYear(), t2.getYear());
if (c1 == 0) {
int c2 = Integer.compare(t1.getMonth(), t2.getMonth());
if (c2 == 0) {
return -Integer.compare(t1.getWd(), t2.getWd()); //negated so that temperatures sort in descending order, as the reducer expects
}
return c2;
}
return c1;
}
}
Case 2
- List difference
- Approach
- MyFD
- package com.bjsxt.fd;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyFD {
public static void main(String[]