案例一
student A 06140412 05 08 102 110 106
student B 06140407 02 06 60 98 80
student C 06140404 10 07 98 31 63
student D 06140403 07 10 105 109 107
student E 06140406 03 03 57 87 92
student F 06140408 10 06 102 102 50
案例二
student G 06140402 03 07 54 61 64
student H 06140401 05 03 83 76 111
student I 06140409 05 10 70 56 91
student J 06140411 07 09 22 119 112
student K 06140410 02 01 45 65 80
student L 06140405 03 02 79 20 26
需求:将多个文件合并为SequenceFile(存储多个小文件)
存储方式:文件路径+文件内容
FileInput类
package com.FileInputFormat.util;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 将多个文件合并为SequenceFile(存储多个小文件) 存储方式:文件路径+文件内容
*/
/**
 * Custom InputFormat that treats each small file as a single, unsplittable
 * record of raw bytes, keyed by NullWritable.
 */
public class FileInput extends FileInputFormat<NullWritable, BytesWritable> {
    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Hand back a reader that delivers the whole file as one record.
        return new FileRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // Keep every small file intact: exactly one split per file.
        return false;
    }
}
FileRecordReader类
package com.FileInputFormat.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 将多个文件合并为SequenceFile(存储多个小文件) 存储方式:文件路径+文件内容
*/
/**
 * RecordReader that emits exactly one record per input file: the key is
 * NullWritable and the value is the file's entire byte content.
 * Used to pack many small files into a single SequenceFile.
 */
public class FileRecordReader extends RecordReader<NullWritable, BytesWritable> {
    private FileSplit fileSplit = null;
    private Configuration conf = null;
    // True once the single whole-file record has been emitted.
    private boolean flag = false;
    private FSDataInputStream fsDataInputStream = null;
    private FileSystem fileSystem = null;
    private BytesWritable value = new BytesWritable();

    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Remember the split (one whole file, since isSplitable is false)
        // and the job configuration for later use in nextKeyValue().
        fileSplit = (FileSplit) inputSplit;
        conf = taskAttemptContext.getConfiguration();
    }

    /**
     * Reads the whole file into {@code value} on the first call and returns
     * true; every subsequent call returns false.
     *
     * @throws IOException if the file cannot be opened or fully read.
     *         The original version swallowed this exception and still
     *         returned true with an empty/partial value, silently producing
     *         corrupt output.
     */
    @Override
    public boolean nextKeyValue() throws IOException {
        if (flag) {
            return false;
        }
        // Buffer sized to the split length = the whole file.
        byte[] buf = new byte[(int) fileSplit.getLength()];
        Path path = fileSplit.getPath();
        fileSystem = path.getFileSystem(conf);
        fsDataInputStream = fileSystem.open(path);
        try {
            IOUtils.readFully(fsDataInputStream, buf, 0, buf.length);
            value.set(buf, 0, buf.length);
        } finally {
            // Close only the stream. The FileSystem instance is cached and
            // shared by Hadoop; closing it here can break other readers in
            // the same JVM.
            IOUtils.closeStream(fsDataInputStream);
        }
        flag = true;
        return true;
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        // Return the NullWritable singleton, not null: the framework may
        // serialize the key, and a raw null would NPE.
        return NullWritable.get();
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // Single-record reader: either nothing or everything has been read.
        return flag ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // Stream is already closed in nextKeyValue(); nothing to release.
    }
}
FileInputMapper类
package com.FileInputFormat.util;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 将多个文件合并为SequenceFile(存储多个小文件) 存储方式:文件路径+文件内容
*/
/**
 * Mapper that tags each whole-file record with its source path:
 * (NullWritable, file bytes) -> (file path, file bytes).
 */
public class FileInputMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
    private FileSplit fileSplit = null;
    private Path path = null;
    // Reused output key; the original allocated a new Text on every map()
    // call even though the path is fixed for the whole split.
    private Text outKey = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Each split is one whole file, so the path is constant per task.
        fileSplit = (FileSplit) context.getInputSplit();
        path = fileSplit.getPath();
        outKey.set(path.toString());
    }

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // Emit (file path, file content) to the reducer.
        context.write(outKey, value);
    }
}
FileInpueReducer类
package com.FileInputFormat.util;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 将多个文件合并为SequenceFile(存储多个小文件) 存储方式:文件路径+文件内容
*/
/**
 * Identity-style reducer: forwards every (file path, file bytes) pair
 * unchanged into the SequenceFile output.
 */
public class FileInpueReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException, InterruptedException {
        // One output record per value grouped under this path.
        for (BytesWritable content : values) {
            context.write(key, content);
        }
    }
}
FileInputDriver类
package com.FileInputFormat.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 将多个文件合并为SequenceFile(存储多个小文件) 存储方式:文件路径+文件内容
*/
/**
 * Driver for the small-files-to-SequenceFile job.
 * Usage: FileInputDriver &lt;input path&gt; &lt;output path&gt;
 */
public class FileInputDriver {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            // Create the job.
            Job job = Job.getInstance(conf);
            // Locate the jar by the driver class.
            job.setJarByClass(FileInputDriver.class);
            // Wire up Mapper and Reducer.
            job.setMapperClass(FileInputMapper.class);
            job.setReducerClass(FileInpueReducer.class);
            // Mapper output types.
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(BytesWritable.class);
            // Reducer (final) output types.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(BytesWritable.class);
            // Custom whole-file input format; SequenceFile output.
            job.setInputFormatClass(FileInput.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            // Input and output paths from the command line.
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Submit and wait; exit code reflects success.
            boolean b = job.waitForCompletion(true);
            System.exit(b ? 0 : 1);
        } catch (IOException | InterruptedException | ClassNotFoundException e) {
            // Multi-catch replaces three identical catch blocks.
            e.printStackTrace();
        }
    }
}
案例
student A 06140412 05 08 102 110 106
student B 06140407 02 06 60 98 80
student C 06140404 10 07 98 31 63
student D 06140403 07 10 105 109 107
student E 06140406 03 03 57 87 92
student F 06140408 10 06 102 102 50
student G 06140402 03 07 54 61 64
student H 06140401 05 03 83 76 111
student I 06140409 05 10 70 56 91
student J 06140411 07 09 22 119 112
student K 06140410 02 01 45 65 80
student L 06140405 03 02 79 20 26
需求:过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
FileOutput类
package com.FileOutputFormat.util;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * Custom OutputFormat that routes records through FileRecordWriter,
 * which filters out everything except room-5 students.
 */
public class FileOutput extends FileOutputFormat<StudentGrade, NullWritable> {
    @Override
    public RecordWriter<StudentGrade, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // A fresh writer per task attempt.
        return new FileRecordWriter();
    }
}
FileRecordWriter类
package com.FileOutputFormat.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * RecordWriter that writes only students from exam room 5, one
 * tab-separated line per student, to a fixed local output file.
 */
public class FileRecordWriter extends RecordWriter<StudentGrade, NullWritable> {
    private FSDataOutputStream fsDataOutputStream;

    public FileRecordWriter() {
        // NOTE(review): builds a fresh Configuration instead of using the
        // task's, and the output path is hard-coded — presumably fine for
        // this local demo; confirm before reuse elsewhere.
        Configuration conf = new Configuration();
        try {
            FileSystem fileSystem = FileSystem.get(conf);
            fsDataOutputStream = fileSystem.create(new Path("F:\\IdeaProjects\\out6\\out.log"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the record if the student sat in room 5; silently skips others.
     *
     * @throws IOException if the output stream failed to open or the write
     *         fails. (Previously a failed constructor left the stream null
     *         and this method threw a bare NullPointerException.)
     */
    @Override
    public void write(StudentGrade studentGrade, NullWritable nullWritable) throws IOException, InterruptedException {
        if (fsDataOutputStream == null) {
            // Fail fast with a diagnosable error instead of an NPE.
            throw new IOException("output stream was not opened; see earlier stack trace");
        }
        // Filter on the typed field directly instead of re-splitting the
        // record's own toString() output.
        if (studentGrade.getRoomID() == 5) {
            fsDataOutputStream.write(studentGrade.toString().getBytes());
            fsDataOutputStream.write("\n".getBytes());
        }
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Release the stream if it was ever opened.
        if (fsDataOutputStream != null) {
            fsDataOutputStream.close();
        }
    }
}
StudentGrade类
package com.FileOutputFormat.util;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * Writable key holding one student's exam record. The total score (sum)
 * is derived from the three subject scores at construction time.
 * Ordering, equality and hashing are all based on the student id, so the
 * three stay mutually consistent (required for correct MapReduce
 * partitioning and grouping).
 */
public class StudentGrade implements WritableComparable<StudentGrade> {
    // Student name.
    private String name;
    // Student id, e.g. "06140412"; numeric, used as the sort key.
    private String id;
    // Exam-room number.
    private int roomID;
    // Chinese score.
    private int chinese;
    // Math score.
    private int math;
    // English score.
    private int english;
    // Total of the three subject scores.
    private int sum;

    /** No-arg constructor required by Hadoop serialization. */
    public StudentGrade() {}

    public StudentGrade(String name, String id, int roomID, int chinese, int math, int english) {
        this.name = name;
        this.id = id;
        this.roomID = roomID;
        this.chinese = chinese;
        this.math = math;
        this.english = english;
        this.sum = chinese + math + english;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public int getRoomID() {
        return roomID;
    }

    public void setRoomID(int roomID) {
        this.roomID = roomID;
    }

    public int getChinese() {
        return chinese;
    }

    public void setChinese(int chinese) {
        this.chinese = chinese;
    }

    public int getMath() {
        return math;
    }

    public void setMath(int math) {
        this.math = math;
    }

    public int getEnglish() {
        return english;
    }

    public void setEnglish(int english) {
        this.english = english;
    }

    public int getSum() {
        return sum;
    }

    public void setSum(int sum) {
        this.sum = sum;
    }

    /**
     * Numeric comparison of student ids. Integer.compare returns 0 for
     * equal ids, satisfying the compareTo contract — the original ternary
     * never returned 0, so a.compareTo(b) and b.compareTo(a) could both be
     * negative, breaking antisymmetry and MapReduce sort/group guarantees.
     */
    @Override
    public int compareTo(StudentGrade object) {
        return Integer.compare(Integer.parseInt(this.id), Integer.parseInt(object.id));
    }

    /** Equality by id, consistent with compareTo. */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof StudentGrade)) {
            return false;
        }
        StudentGrade other = (StudentGrade) o;
        return id == null ? other.id == null : id.equals(other.id);
    }

    /**
     * Hash by id. Without this override HashPartitioner used the identity
     * hash, scattering equal keys across reducers at random.
     */
    @Override
    public int hashCode() {
        return id == null ? 0 : id.hashCode();
    }

    /** Serialization order must match readFields exactly. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(name);
        dataOutput.writeUTF(id);
        dataOutput.writeInt(roomID);
        dataOutput.writeInt(chinese);
        dataOutput.writeInt(math);
        dataOutput.writeInt(english);
        dataOutput.writeInt(sum);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        name = dataInput.readUTF();
        id = dataInput.readUTF();
        roomID = dataInput.readInt();
        chinese = dataInput.readInt();
        math = dataInput.readInt();
        english = dataInput.readInt();
        sum = dataInput.readInt();
    }

    /** Tab-separated: name, id, roomID, chinese, math, english, sum. */
    @Override
    public String toString() {
        return name + "\t" + id + "\t" + roomID + "\t" + chinese + "\t" + math + "\t" + english + "\t" + sum;
    }
}
FileOutputMapper类
package com.FileOutputFormat.util;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * Parses one tab-separated student line into a StudentGrade key.
 * Input layout: name, id, roomID, field[3] (skipped — presumably a seat
 * number), chinese, math, english.
 */
public class FileOutputMapper extends Mapper<LongWritable, Text, StudentGrade, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the raw line on tabs.
        String[] field = value.toString().split("\t");
        StudentGrade grade = new StudentGrade(
                field[0],                    // student name
                field[1],                    // student id
                Integer.parseInt(field[2]),  // exam-room number
                Integer.parseInt(field[4]),  // Chinese score (field[3] unused)
                Integer.parseInt(field[5]),  // math score
                Integer.parseInt(field[6])); // English score
        // Emit the record; NullWritable carries no payload.
        context.write(grade, NullWritable.get());
    }
}
FileOutputReducer类
package com.FileOutputFormat.util;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * Emits exactly one output record per distinct student key; the
 * NullWritable value carries no data.
 */
public class FileOutputReducer extends Reducer<StudentGrade, NullWritable, StudentGrade, NullWritable> {
    @Override
    protected void reduce(StudentGrade key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Take the first (and only meaningful) value for this key.
        NullWritable marker = values.iterator().next();
        context.write(key, marker);
    }
}
FileOutputDriver类
package com.FileOutputFormat.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 过滤数据,把5号考场的学生放到一个文件中,并计算每个学生的总成绩
*/
/**
 * Driver for the room-5 filter job.
 * Usage: FileOutputDriver &lt;input path&gt; &lt;output path&gt;
 */
public class FileOutputDriver {
    public static void main(String[] args) {
        try {
            // Create the configuration and the job.
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            // Locate the jar by the driver class.
            job.setJarByClass(FileOutputDriver.class);
            // Wire up Mapper and Reducer.
            job.setMapperClass(FileOutputMapper.class);
            job.setReducerClass(FileOutputReducer.class);
            // Mapper output types.
            job.setMapOutputKeyClass(StudentGrade.class);
            job.setMapOutputValueClass(NullWritable.class);
            // Reducer (final) output types.
            job.setOutputKeyClass(StudentGrade.class);
            job.setOutputValueClass(NullWritable.class);
            // Custom output format that filters to room-5 students.
            job.setOutputFormatClass(FileOutput.class);
            // Input and output paths from the command line.
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Submit and wait; exit code reflects success.
            boolean b = job.waitForCompletion(true);
            System.exit(b ? 0 : 1);
        } catch (IOException | InterruptedException | ClassNotFoundException e) {
            // Multi-catch replaces three identical catch blocks.
            e.printStackTrace();
        }
    }
}