MapReduce
MR: a programming model.
WordCount in MR
1. Write the Mapper
package com.it18zhang.hadoop.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * WordCountMapper: emits (word, 1) for every word in the input line.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // key is the byte offset of the line; value is the line itself.
        String line = value.toString();
        String[] arr = line.split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            keyOut.set(word);
            context.write(keyOut, valueOut);
        }
    }
}
2. Write the Reducer
package com.it18zhang.hadoop.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * WordCountReducer: sums the counts for each word.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count += iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
3. Write the App (job driver)
package com.it18zhang.hadoop.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * App: driver that configures and submits the WordCount job.
 */
public class App {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJobName("WordCount");
        job.setJarByClass(App.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Add the input path.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Set the output path (it must not exist yet).
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Set the map and reduce output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(2);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
4. Run
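For a local run (an assumption; the notes don't spell this step out), pass an existing input file and a not-yet-existing output directory as the two program arguments, e.g. file:///home/centos/1.txt file:///home/centos/out (hypothetical paths).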
5. Export the jar and run it on the Hadoop cluster
5.1) Export the jar
5.2) Deploy it to CentOS
5.3) Start the YARN cluster
start-yarn.sh
5.4) Check the YARN web UI
http://s100:8088/
5.5) Prepare the data
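For example, upload the input file to HDFS (an assumed command, matching the paths used in 5.6):
hdfs dfs -put 1.txt /user/centos/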
5.6) Run the job
hadoop jar my-hadoop-day04.jar com.it18zhang.hadoop.mr.App /user/centos/1.txt /user/centos/out
Combiner
A map-side "reduce": it pre-aggregates each map task's output partition before the shuffle, reducing network traffic.
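Because word counts are sums (associative and commutative), the existing reducer can be reused as the combiner; a one-line addition to the App driver above:
job.setCombinerClass(WordCountReducer.class);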
Mapper
run(Context context) {
    setup(context);
    while (context.nextKeyValue()) {
        map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
    cleanup(context);
}
Reducer
run(Context context) {
    setup(context);
    while (context.nextKey()) {
        reduce(context.getCurrentKey(), context.getValues(), context);
    }
    cleanup(context);
}
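Since run() drives this setup/loop/cleanup lifecycle, per-task one-time work belongs in setup() and cleanup(). A minimal illustrative sketch (this class is not part of the course code):

package com.it18zhang.hadoop.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/** Illustrative only: shows where setup()/cleanup() fall in the run() lifecycle. */
public class LifecycleMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void setup(Context context) {
        // Called once per task, before the first map() call.
        System.out.println("setup: " + context.getTaskAttemptID());
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Called once per input key-value pair.
        context.write(new Text(value.toString()), new IntWritable(1));
    }

    @Override
    protected void cleanup(Context context) {
        // Called once per task, after the last map() call.
        System.out.println("cleanup: " + context.getTaskAttemptID());
    }
}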
Staging directory used by a local-mode job run:
file:/tmp/hadoop-Administrator/mapred/staging/Administrator897294152/.staging
splitSize
The input split size is derived from three values: minSplitSize, maxSplitSize, and blockSize, via
splitSize = max(minSplitSize, min(maxSplitSize, blockSize))
Example: min = 7, max = 7, block = 32M  =>  splitSize = 7
For a 24-byte file, 24 / 7 = 3.43, so FileInputFormat produces splits of 7, 7, 7, and 3 bytes (it keeps splitting while the remaining bytes exceed 1.1 x splitSize, and the last split absorbs the remainder).
Sample input; the # appears to mark a split boundary cutting a line mid-word (LineRecordReader still delivers whole lines, because the split containing the start of a line reads on to the line's end):
hello world
h#ello world
This exercise examines the split computation rule.
1. Change the split min/max:
job.getConfiguration().set("mapreduce.input.fileinputformat.split.minsize","14");
job.getConfiguration().set("mapreduce.input.fileinputformat.split.maxsize","14");
// Equivalent helper methods:
// FileInputFormat.setMinInputSplitSize(job, 14);
// FileInputFormat.setMaxInputSplitSize(job, 14);
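For reference, the computation FileInputFormat performs boils down to (this mirrors its computeSplitSize() method):
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    // min and max split sizes clamp the block size.
    return Math.max(minSize, Math.min(maxSize, blockSize));
}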
MultipleInputs
Multiple inputs: one job can read several paths, each with its own InputFormat and, optionally, its own Mapper.
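A driver sketch (the paths are hypothetical; both inputs reuse WordCountMapper here):
// import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
// import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
// import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
MultipleInputs.addInputPath(job, new Path("/data/texts"), TextInputFormat.class, WordCountMapper.class);
MultipleInputs.addInputPath(job, new Path("/data/seqs"), SequenceFileInputFormat.class, WordCountMapper.class);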
DBInputFormat
Splits a table by row ranges:
count                                                                 // total record count, e.g. 100
int chunks = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1); // number of map tasks, e.g. 3
count / chunks                                                        // records per split, e.g. 33 (the last split takes the remainder)
1. Implement a DBWritable class
package com.hadoop.mr.input.db;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

/**
 * MyDBWritable: maps one database row onto Java fields.
 */
public class MyDBWritable implements DBWritable, Writable {
    public int id;
    public String orderno;
    public float price;
    public int cid;

    // Used only when writing back to the database; empty for an input-only job.
    public void write(PreparedStatement statement) throws SQLException {
    }

    // Populate the fields from one row of the query's result set.
    public void readFields(ResultSet rs) throws SQLException {
        this.id = rs.getInt("id");
        this.orderno = rs.getString("orderno");
        this.price = rs.getFloat("price");
        this.cid = rs.getInt("cid");
    }

    // Serialize.
    public void write(DataOutput out) throws IOException {
        out.writeInt(id);
        out.writeUTF(orderno);
        out.writeFloat(price);
        out.writeInt(cid);
    }

    // Deserialize.
    public void readFields(DataInput in) throws IOException {
        this.id = in.readInt();
        this.orderno = in.readUTF();
        this.price = in.readFloat();
        this.cid = in.readInt();
    }
}
2. Customize the DBInputFormat
package com.hadoop.mr.input.db;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;

import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

/**
 * MyDBInputFormat: overrides getSplits() to control how table rows
 * are divided into input splits.
 */
public class MyDBInputFormat extends DBInputFormat<MyDBWritable> {

    public void setConf(Configuration conf) {
        super.setConf(conf);
    }

    public List<InputSplit> getSplits(JobContext job) throws IOException {
        ResultSet results = null;
        Statement statement = null;
        try {
            statement = getConnection().createStatement();
            // Count the total number of rows.
            results = statement.executeQuery(getCountQuery());
            results.next();
            long count = results.getLong(1);
            // Number of splits; hardcoded here instead of the default
            // job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1).
            int chunks = 3;
            long chunkSize = (count / chunks);
            results.close();
            statement.close();
            List<InputSplit> splits = new ArrayList<InputSplit>();
            // Split the rows into n chunks; the last chunk absorbs the remainder.
            for (int i = 0; i < chunks; i++) {
                DBInputSplit split;
                if ((i + 1) == chunks)
                    split = new DBInputSplit(i * chunkSize, count);
                else
                    split = new DBInputSplit(i * chunkSize, (i * chunkSize) + chunkSize);
                splits.add(split);
            }
            getConnection().commit();
            return splits;
        } catch (SQLException e) {
            throw new IOException("Got SQLException", e);
        } finally {
            try {
                if (results != null) {
                    results.close();
                }
            } catch (SQLException e1) {
            }
            try {
                if (statement != null) {
                    statement.close();
                }
            } catch (SQLException e1) {
            }
            closeConnection();
        }
    }
}
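A sketch of the driver wiring (the JDBC driver, URL, credentials, and table/queries are assumptions, not from the notes):
// import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
DBConfiguration.configureDB(job.getConfiguration(),
        "com.mysql.jdbc.Driver",            // JDBC driver class (assumption)
        "jdbc:mysql://localhost:3306/test", // JDBC URL (assumption)
        "root", "root");                    // credentials (assumption)
// Registers the row query and count query; note that setInput() also sets
// DBInputFormat as the input format...
DBInputFormat.setInput(job, MyDBWritable.class,
        "select id, orderno, price, cid from orders", // row query (table name assumed)
        "select count(*) from orders");               // count query
// ...so the custom subclass must be set afterwards to take effect.
job.setInputFormatClass(MyDBInputFormat.class);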
Output formats
1. TextOutputFormat
The default output format.
2. SequenceFileOutputFormat
job.setOutputFormatClass(SequenceFileOutputFormat.class);
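Sequence-file output can also be compressed; a sketch (block compression is an arbitrary choice here):
// import org.apache.hadoop.io.SequenceFile;
SequenceFileOutputFormat.setCompressOutput(job, true);
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);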
MR features
1. Counters
Counters can double as a lightweight remote-debugging aid: encode host, port, thread, and method into the counter name (see the example under 3 below).
2. Counter limit
[mapred-site.xml]
mapreduce.job.counters.limit=120
3. Counter name length limit (64 characters, hard-coded)
Example of a debug-style counter name:
192.168.11.113:13932:pool-3-thread-1:DBReduce@546740333:reduce()
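A minimal sketch of incrementing and reading a counter (the group and counter names are illustrative):
// Inside map() or reduce():
context.getCounter("debug", "reduce-calls").increment(1);
// In the driver, after job.waitForCompletion(true):
long calls = job.getCounters().findCounter("debug", "reduce-calls").getValue();
System.out.println("reduce() called " + calls + " times");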