This program consists of four classes:
MyWritable.java
A custom Writable that reads a name pair such as (张三 李四) from each input line as a single Writable value.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class MyWritable implements WritableComparable&lt;MyWritable&gt; {
    // Both fields must be instantiated here; otherwise readFields() throws a NullPointerException
    // when the framework deserializes an empty instance.
    private Text first = new Text();
    private Text second = new Text();

    /** Deserialize the two fields in the same order in which write() serialized them. */
    public void readFields(DataInput in) throws IOException {
        first.readFields(in);
        second.readFields(in);
    }

    public Text getFirst() {
        return first;
    }

    public void setFirst(Text first) {
        this.first = first;
    }

    public Text getSecond() {
        return second;
    }

    public void setSecond(Text second) {
        this.second = second;
    }

    /** Serialize the two fields one after the other. */
    public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
    }

    /** Order by first, then by second, so the pair can also be used as a sortable key. */
    public int compareTo(MyWritable tp) {
        int cmp = first.compareTo(tp.first);
        if (cmp != 0) {
            return cmp;
        }
        return second.compareTo(tp.second);
    }

    @Override
    public String toString() {
        return first + "\t" + second;
    }

    // equals() must override Object.equals(Object); a two-argument version would never be
    // called by Hadoop's collections or partitioners.
    @Override
    public boolean equals(Object o) {
        if (!(o instanceof MyWritable)) {
            return false;
        }
        MyWritable other = (MyWritable) o;
        return first.equals(other.first) && second.equals(other.second);
    }

    @Override
    public int hashCode() {
        return first.hashCode() * 163 + second.hashCode();
    }
}
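To make the Writable contract concrete, here is a minimal round-trip sketch (an illustrative addition, not one of the four classes of the program): it serializes a MyWritable with write() and reads it back with readFields(), which is essentially what Hadoop does when it moves the value between the RecordReader, the Mapper and the shuffle.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.Text;

public class MyWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        MyWritable original = new MyWritable();
        original.setFirst(new Text("张三"));
        original.setSecond(new Text("李四"));

        // Serialize with write(), as the framework would.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance with readFields().
        MyWritable copy = new MyWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy);                    // 张三    李四
        System.out.println(original.equals(copy));   // true
    }
}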
MyWritableTest.java
The main driver class that configures and submits the job.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MyWritableTest {
    // The map stage receives (Text, MyWritable) pairs from WInputFormat and emits
    // the first name as the output key and the second name as the output value.
    public static class WMap extends Mapper&lt;Text, MyWritable, Text, Text&gt; {
        // implementation of the map function
        @Override
        public void map(Text key, MyWritable value, Context context)
                throws IOException, InterruptedException {
            context.write(value.getFirst(), value.getSecond());
        }
    }

    public static class WReduce extends Reducer&lt;Text, Text, Text, Text&gt; {
        // implementation of the reduce function
        @Override
        public void reduce(Text key, Iterable&lt;Text&gt; values, Context context)
                throws IOException, InterruptedException {
            for (Text val : values) {
                if (val != null) {
                    context.write(key, val);
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "MyWritableTest");
        job.setJarByClass(MyWritableTest.class);
        // set the Mapper and Reducer classes
        job.setMapperClass(WMap.class);
        job.setReducerClass(WReduce.class);
        job.setInputFormatClass(WInputFormat.class);
        job.setNumReduceTasks(2);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // the reducer also emits (Text, Text), so declare the final output types as well
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // set the input and output directories
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
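To make the data flow concrete (a worked example; the file name is only illustrative): if args[0] points to a directory containing a file such as input/names.txt with the line

张三 李四

then WRecordReader turns that line into the pair (key = "count", value = MyWritable(张三, 李四)), WMap re-emits it as (张三, 李四), WReduce passes it through unchanged, and the default TextOutputFormat writes the tab-separated line 张三&lt;TAB&gt;李四 into one of the two output files (part-r-00000 or part-r-00001, since two reduce tasks were requested).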
WInputFormat.java
The custom InputFormat, which returns the custom RecordReader below.
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
/**
 * InputFormat that produces (Text, MyWritable) records from plain text files.
 * Based on an MBox-reading example by Srinath Perera (hemapani@apache.org).
 */
public class WInputFormat extends FileInputFormat&lt;Text, MyWritable&gt; {
    private WRecordReader wrecord = null;

    @Override
    public RecordReader&lt;Text, MyWritable&gt; createRecordReader(
            InputSplit inputSplit, TaskAttemptContext attempt)
            throws IOException, InterruptedException {
        wrecord = new WRecordReader();
        wrecord.initialize(inputSplit, attempt);
        return wrecord;
    }
}
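One caveat worth noting (a suggested addition, not in the original code): WRecordReader always opens its file at the beginning and reads line by line, so if an input file spanned more than one HDFS block, each split would re-read the same lines. A simple way to avoid that is to keep every file in a single split, for example:

    // Suggested addition inside WInputFormat (it also requires importing
    // org.apache.hadoop.fs.Path and org.apache.hadoop.mapreduce.JobContext):
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }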
WRecordReader.java
The custom RecordReader, which implements reading the custom Writable from the input.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
 * Reads the split line by line and turns each line into a MyWritable.
 * Based on an MBox-parsing example by Srinath Perera (hemapani@apache.org).
 */
public class WRecordReader extends RecordReader&lt;Text, MyWritable&gt; {
    private BufferedReader reader;
    private FSDataInputStream fsStream;
    private long splitLength = 1;
    private int count = 0;   // number of records emitted so far
    private Text key;
    private MyWritable value = new MyWritable();

    public WRecordReader() {
    }

    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext attempt)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) inputSplit;
        Path path = split.getPath();
        splitLength = split.getLength();
        FileSystem fs = FileSystem.get(URI.create(path.toString()), attempt.getConfiguration());
        fsStream = fs.open(path);
        reader = new BufferedReader(new InputStreamReader(fsStream));
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        String line;
        // Skip blank or malformed lines instead of failing on them.
        while ((line = reader.readLine()) != null) {
            // Each line is expected to hold two whitespace-separated names, e.g. "张三 李四".
            String[] s = line.split("\\s+");
            if (s.length >= 2) {
                count++;
                value.setFirst(new Text(s[0]));
                value.setSecond(new Text(s[1]));
                key = new Text("count");
                return true;
            }
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public MyWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // Progress must be a fraction in [0, 1]; approximate it by the bytes consumed so far.
        if (splitLength == 0) {
            return 1.0f;
        }
        return Math.min(1.0f, fsStream.getPos() / (float) splitLength);
    }

    @Override
    public void close() throws IOException {
        if (reader != null) {
            reader.close();
        }
    }
}
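For orientation, this is roughly how the framework drives the reader during a map task (a simplified sketch, not the actual framework source; split and attemptContext stand for the objects the framework supplies):

    WRecordReader reader = new WRecordReader();
    reader.initialize(split, attemptContext);
    while (reader.nextKeyValue()) {
        Text k = reader.getCurrentKey();           // always "count" in this example
        MyWritable v = reader.getCurrentValue();   // e.g. 张三 / 李四
        // the framework passes (k, v) to WMap.map(...)
    }
    reader.close();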