数据
36.63.116.201|sdk.conf.igexin.com|20170207161935|61.147.218.24;222.186.20.109;222.186.20.123|0
36.63.123.215|cm052.getui.igexin.com|20170207161935|183.131.1.82|0
36.63.132.38|mmbiz.qpic.cn|20170207161935|122.228.72.152;115.231.191.141;122.228.72.165;122.228.72.151;122.228.72.147;115.231.191.143;122.228.72.163;122.228.72.159;115.231.191.144;122.228.56.157;122.228.72.166;122.228.56.155;122.228.72.164;122.228.56.156;115.231.191.142;122.228.72.148|0117.70.249.121|punch.p2p.qq.com|20170207161935|14.17.43.40|0
114.102.113.19|omgmta.play.t002.ottcn.com|20170207161935|123.151.179.173|0
36.63.40.131|pop.sjk.ijinshan.com|20170207161935|60.169.76.70;61.132.239.147;61.132.239.146|0
36.5.84.35|bird.sns.iqiyi.com|20170207161935|106.38.219.54;106.38.219.34|0
36.4.13.244|tx2.a.yximgs.com|20170207161935|61.191.60.17;61.191.60.16;61.191.60.19;61.191.60.18|0
36.4.151.103|r.vip.qq.com|20170207161935|14.215.138.24|0
223.244.111.107|supportcmsecurity1.ksmobile.com|20170207161935|221.228.204.21;119.147.146.70|0
题目要求
利用MapReduce程序进行简单的过滤清洗，将字段个数不满足5个的数据过滤掉，并且将网站地址中为"www.taobao.com"的标记替换成"ShoppingAction"，最后将清洗过滤后的数据全部输出。
代码
package com.mr2;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class FenOne {
public static class MyMapper extends Mapper<LongWritable,Text,Text,NullWritable>
{
protected void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException
{
String s = String.valueOf(value);
String[] split = s.split("\\|");
StringBuffer s1 = new StringBuffer();
//过滤掉不满足5个字段的
if(split.length==5)
{
//过滤掉字段为空的
if(split[0].length()>0&&split[1].length()>0&&split[2].length()>0&&split[3].length()>0&&split[4].length()>0)
{
s1.append(split[0]);
for(int i=1;i<5;i++)
{
//替换
if(split[i].equals("www.taobao.com"))
{
s1.append("|ShoppingAction");
}
else
{
s1.append("|"+split[i]);
}
}
context.write(new Text(String.valueOf(s1)), NullWritable.get());
}
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
// TODO Auto-generated method stub
Configuration conf = new Configuration();
Job job = Job.getInstance(conf,FenOne.class.getSimpleName());
job.setJarByClass(FenOne.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}