Interview question: one I found online. I worked through it so we can discuss it here — if you have a better approach, please share ^^
Problem:
The current log sample format is:
a,b,c,d
b,b,f,e
a,a,c,f
Using the language you are most familiar with, write a MapReduce job that counts the number of occurrences of each element in the fourth column.
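Note on interpretation: read literally, the mapper could simply emit the fourth field of each line. The solution below takes a broader reading: it first collects the set of values that appear in the fourth column, then counts every occurrence of those values across all columns. For the sample above that yields d=1, e=1, f=2 (f appears once in the third column of line 2 and once in the fourth column of line 3).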
package ms;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* @Title: MS.java
* @Package ms
* @author zfy1355_gmail_com
* @date 2016-02-24 18:37:42
* @version V1.0
*/
public class MS extends Configured implements Tool {
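// Mapper: setup() loads the set of fourth-column values from the distributed
// cache; map() then emits (element, 1) for every occurrence of those values.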
public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
private Map<String,String> dataMap = new HashMap<String,String>();
private IntWritable one = new IntWritable(1);
@Override
protected void setup(
Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
BufferedReader in = null;
// Files registered with DistributedCache are available on the task's local disk.
Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
try {
    if (paths != null) {
        for (Path path : paths) {
            if (path.toString().contains("data")) {
                in = new BufferedReader(new FileReader(path.toString()));
                String data;
                // Record every value that appears in the fourth column; only
                // these elements will be counted in the map phase.
                while ((data = in.readLine()) != null) {
                    dataMap.put(data.split(",")[3], null);
                }
            }
        }
    }
} catch (Exception e) {
    e.printStackTrace();
} finally {
    if (in != null) {
        in.close();
    }
}
}
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
String[] vs = value.toString().split(",");
// Emit (element, 1) for every field whose value was seen in the fourth column.
for (int i = 0; i < vs.length; i++) {
    if (dataMap.containsKey(vs[i])) {
        context.write(new Text(vs[i]), one);
    }
}
}
}
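// Reducer: totals the counts emitted for each element.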
public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Reducer<Text, IntWritable, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
int count = 0;
// Sum the values rather than counting entries, so the reducer stays
// correct even if a combiner is ever configured for this job.
for (IntWritable c : values) {
    count += c.get();
}
context.write(key, new IntWritable(count));
}
}
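// Driver: the input file is also registered as a DistributedCache file so the
// mapper's setup() can pre-scan the fourth column before any records arrive.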
@Override
public int run(String[] args) throws Exception {
Path inPath = new Path("hdfs://ns1/user/hadoop/data.txt");
Job job = new Job(getConf(),"data count");
job.setJarByClass(getClass());
FileInputFormat.addInputPath(job,inPath);
FileOutputFormat.setOutputPath(job, new Path("hdfs://ns1/user/hadoop/dataout"));
DistributedCache.addCacheFile(inPath.toUri(), job.getConfiguration());
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) {
    try {
        // Propagate the job's exit status to the calling shell.
        System.exit(ToolRunner.run(new MS(), args));
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
    }
}
}
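To try it out (assuming the class is packaged as ms.jar and data.txt has been uploaded to the hard-coded HDFS path used in run()):

hadoop jar ms.jar ms.MS
hadoop fs -cat hdfs://ns1/user/hadoop/dataout/part-r-00000

For the three sample lines above, the output should be:

d	1
e	1
f	2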