招聘网站—MapReduce数据清洗

第1关:数据清洗

package recruit.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class RecruitMap extends Mapper<LongWritable, Text, Text, NullWritable> {

    /**
     * Cleans one tab-separated recruitment record.
     *
     * <p>A record is kept only if it has exactly 9 fields, none of them blank,
     * and its salary field (index 2) is a numeric "low-high" range after the
     * "k"/"K" unit suffix is stripped. Kept records are normalized (lowercase
     * fields 0 and 8, "·" separators rewritten, salary replaced by the range
     * midpoint) and emitted as a single tab-joined line in the key, with a
     * null value. Field meanings are inferred from the transformations below;
     * NOTE(review): confirm the 9-column schema against the input file.
     *
     * @param key     byte offset of the line in the input split (unused)
     * @param value   one raw input line
     * @param context Hadoop context used to emit the cleaned record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        /********** Begin **********/
        String[] fields = value.toString().split("\t");
        // Only records with exactly 9 columns are considered well-formed.
        if (fields.length != 9) {
            return;
        }
        // Drop records containing an empty or whitespace-only field.
        // (The original check `field.trim() == null` was always false, since
        // String.trim() never returns null; isEmpty() is the intended test.)
        for (String field : fields) {
            if (field == null || field.trim().isEmpty()) {
                return;
            }
        }
        // Strip both lowercase and uppercase unit suffixes so mixed-case
        // salaries such as "10k-20K" are also normalized.
        fields[2] = fields[2].replace("k", "").replace("K", "");
        // Discard records whose salary is not a plain "low-high" integer range.
        if (!fields[2].matches("[0-9]+-[0-9]+")) {
            return;
        }
        fields[0] = fields[0].toLowerCase();
        fields[8] = fields[8].toLowerCase();
        // Rewrite the "·" separators in field 8 as "|".
        fields[8] = fields[8].replaceAll("·", "|");
        // Keep only the part of field 1 before the first "·" (e.g. city name).
        fields[1] = fields[1].trim().split("·")[0].trim();
        // Replace the salary range with its midpoint, formatted to 2 decimals.
        // (split[0] is the LOW bound, split[1] the HIGH bound; the original
        // code had these names swapped, though the average was unaffected.)
        String[] range = fields[2].split("-");
        double low = Double.parseDouble(range[0].trim());
        double high = Double.parseDouble(range[1].trim());
        fields[2] = String.format("%.2f", (low + high) / 2);
        // Re-assemble the cleaned record as one tab-separated line.
        context.write(new Text(String.join("\t", fields)), NullWritable.get());
        /********** End **********/
    }
}
package recruit.mapreduce;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class RecruitReduce extends Reducer<Text, NullWritable, NullWritable, Text> {

    /**
     * Emits each distinct cleaned record exactly once.
     *
     * <p>The mapper placed the whole cleaned line in the key, so grouping by
     * key deduplicates records; here the key/value roles are swapped so the
     * output file contains only the record text, one line per record.
     *
     * @param key     the cleaned, tab-separated record
     * @param values  ignored null placeholders (one per duplicate)
     * @param context Hadoop context used to emit the record
     */
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        /********** Begin **********/
        // The key already holds the full record; write it through as the value.
        context.write(NullWritable.get(), key);
        /********** End **********/
    }
}

package recruit.mapreduce;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class RecruitJob {

    /**
     * Configures and runs the recruitment-data cleaning job.
     *
     * <p>Reads the raw advertise data from a fixed HDFS path, runs
     * {@link RecruitMap} / {@link RecruitReduce}, and writes the cleaned,
     * deduplicated records to {@code /root/files}. Any pre-existing output
     * directory is removed first, since Hadoop refuses to overwrite one.
     *
     * @param args unused command-line arguments
     * @throws Exception if job setup or filesystem access fails
     */
    public static void main(String[] args) throws Exception {
        /********** Begin **********/
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(RecruitJob.class);
        job.setMapperClass(RecruitMap.class);
        job.setReducerClass(RecruitReduce.class);
        // Mapper emits the cleaned line as the key with a null value...
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // ...and the reducer swaps them so only the record text is written.
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        String outputpath = "/root/files";
        Path path = new Path(outputpath);
        // Delete a stale output directory or the job will fail to start.
        FileSystem fileSystem = path.getFileSystem(conf);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileInputFormat.addInputPath(job, new Path("/data/workspace/myshixun/clean/data/advertise.txt"));
        FileOutputFormat.setOutputPath(job, path);
        // Propagate job success/failure to the process exit code; the original
        // code discarded waitForCompletion's result and always exited 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        /********** End **********/
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Wa_Automata

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值