需求:实现两个文件的拼接(join)。以两个文件中相同的字段(手机号前7位的号段)作为key,将归属地文件中的整行数据作为value存入Map,再把拼接后的结果封装到新建的JavaBean(HPBean)中输出。
package com.liuliang;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Map-side join job.
 *
 * <p>Each record of {@code d:/data/http.log} is joined with the line of
 * {@code d:/data/phone.txt} whose second field equals the first seven characters of the
 * record's phone number. The joined data is written out as an {@code HPBean} key with a
 * {@code NullWritable} value.
 */
public class MR {

    /**
     * Mapper that loads the lookup file once in {@link #setup} and joins every log record
     * against it in {@link #map}.
     *
     * <p>Records that cannot be joined (malformed line, unknown prefix, non-numeric flow
     * values) are skipped and counted in the {@code "MR"} counter group instead of failing
     * the whole task.
     */
    public static class MapTask extends Mapper<LongWritable, Text, HPBean, NullWritable> {

        /** Number of leading phone-number characters used as the join key. */
        private static final int PREFIX_LENGTH = 7;

        /** Counter group under which skipped records are reported. */
        private static final String COUNTER_GROUP = "MR";

        /** Lookup table: second field of a phone.txt line -> the complete line. */
        final Map<String, String> map = new HashMap<>();

        /** Output key instance, reused for every record written by this mapper. */
        final HPBean bean = new HPBean();

        /**
         * Reads {@code d:/data/phone.txt} and fills {@link #map}.
         *
         * <p>Sample line (whitespace separated):
         * {@code 130 1300000 <province> <city> <isp> 250000 0531 370100}
         *
         * @param context task context; its configuration is used to obtain the file system
         * @throws IOException if the lookup file cannot be opened or read
         */
        @Override
        protected void setup(Mapper<LongWritable, Text, HPBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            // Use the job's configuration rather than a fresh one so job-level settings apply.
            Configuration conf = context.getConfiguration();
            // FileSystem instances are not closed here because FileSystem.get may hand out
            // a shared instance.
            FileSystem fs = FileSystem.get(conf);
            // try-with-resources guarantees the stream is closed even if reading fails.
            try (FSDataInputStream inputStream = fs.open(new Path("d:/data/phone.txt"));
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream))) {
                String line;
                while ((line = br.readLine()) != null) {
                    String[] split = line.split("\\s");
                    if (split.length < 2) {
                        // Blank or truncated line: there is no key field to index by.
                        continue;
                    }
                    map.put(split[1], line);
                }
            }
        }

        /**
         * Joins one log record with its lookup line and emits the populated bean.
         *
         * <p>Sample log line (whitespace separated):
         * {@code 13826544101 http://www.weibo.com/?category=7 20 5000}
         *
         * <p>The bean receives, in order: the seven-character prefix parsed as a long, log
         * fields 2 and 3 parsed as longs, and lookup fields 2, 3 and 4 as strings.
         *
         * @param key input key supplied by the input format (unused)
         * @param value one line of the log file
         * @param context task context used for output and counters
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] strings = value.toString().split("\\s");
            if (strings.length < 4 || strings[0].length() < PREFIX_LENGTH) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_LOG_LINE").increment(1);
                return;
            }

            String seven = strings[0].substring(0, PREFIX_LENGTH);
            String lookupLine = map.get(seven);
            if (lookupLine == null) {
                // No lookup entry for this prefix; previously this caused a NullPointerException.
                context.getCounter(COUNTER_GROUP, "UNMATCHED_PREFIX").increment(1);
                return;
            }

            String[] split = lookupLine.split("\\s");
            if (split.length < 5) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_LOOKUP_LINE").increment(1);
                return;
            }

            long phone;
            long upFlow;
            long downFlow;
            try {
                phone = Long.parseLong(seven);
                upFlow = Long.parseLong(strings[2]);
                downFlow = Long.parseLong(strings[3]);
            } catch (NumberFormatException e) {
                // A single non-numeric record is counted and skipped rather than failing the task.
                context.getCounter(COUNTER_GROUP, "NON_NUMERIC_FIELD").increment(1);
                return;
            }

            bean.set(phone, upFlow, downFlow, split[2], split[3], split[4]);
            context.write(bean, NullWritable.get());
        }
    }

    /**
     * Configures and runs the job, then prints whether it succeeded.
     *
     * <p>No custom reducer class is configured; the join is done entirely in the mapper.
     * An existing output directory is deleted before the job starts.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(MR.class);
        job.setMapperClass(MapTask.class);

        job.setOutputKeyClass(HPBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path("d:/data/http.log"));
        FileOutputFormat.setOutputPath(job, new Path("d:/out/http/"));

        // The job refuses to start if the output directory already exists, so remove it first.
        File file = new File("d:/out/http/");
        if (file.exists()) {
            FileUtils.deleteDirectory(file);
        }

        boolean completion = job.waitForCompletion(true);
        System.out.println(completion ? "执行成功" : "执行失败");
    }
}
注意事项:
1.要分清楚是哪个文件拼接到另一个文件上(本例是把phone.txt中的归属地信息拼接到http.log的每条记录上)。
2.在新建的JavaBean中,字段名要与拼接后的数据一一对应。
3.两个文件中找不到完全相同的字段时,要通过变换得到(例如截取手机号的前7位作为号段,再与归属地文件中的号段字段匹配)。
4.将小文件的读取放在setup中,拼接在map阶段就可以完成,reduce可以省下来做更多其他的事情。