【MapReduce】Cleaning Shared-Bike Data with MapReduce

Data

Click to download the data
The corresponding fields are: end time, bike id, start location, end location, city, start longitude, start latitude, end longitude, end latitude.

  • Requirements
    Drop any record that contains an empty field or "NA"
    Convert the time format to the style 2017-07-01 00:45 (a minimal sketch of this conversion follows the list)
    Compute the longitude and latitude span of each ride
    Partition the records into separate files by city; within each city's file, sort in ascending order of bike id
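Before the full MapReduce code, here is a minimal standalone sketch of just the date conversion. It uses the same two SimpleDateFormat patterns as the Mapper below; the sample timestamp comes from the requirement above.

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class DateFormatSketch {
    public static void main(String[] args) throws ParseException {
        // Raw data uses MM/dd/yyyy HH:mm; the cleaned output uses yyyy-MM-dd HH:mm
        SimpleDateFormat raw = new SimpleDateFormat("MM/dd/yyyy HH:mm");
        SimpleDateFormat cleaned = new SimpleDateFormat("yyyy-MM-dd HH:mm");
        Date d = raw.parse("7/1/2017 0:45");    // lenient parsing accepts single-digit fields
        System.out.println(cleaned.format(d));  // prints: 2017-07-01 00:45
    }
}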

Code implementation

Custom class

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class JavaBean implements WritableComparable<JavaBean> {
    private String startTime;
    private String endTime;
    private int id;
    private String start_loc;
    private String end_loc;
    private String city;
    // Longitude/latitude span of the ride (end coordinate minus start coordinate)
    private double longitude;
    private double latitude;

    @Override
    public int compareTo(JavaBean o) {
        // Ascending by bike id; the shuffle phase sorts keys with this method
        return Integer.compare(this.id, o.id);
    }

    // Serialization: the field order here must match readFields() exactly
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(startTime);
        dataOutput.writeUTF(endTime);
        dataOutput.writeInt(id);
        dataOutput.writeUTF(start_loc);
        dataOutput.writeUTF(end_loc);
        dataOutput.writeUTF(city);
        dataOutput.writeDouble(longitude);
        dataOutput.writeDouble(latitude);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        startTime = dataInput.readUTF();
        endTime = dataInput.readUTF();
        id = dataInput.readInt();
        start_loc = dataInput.readUTF();
        end_loc = dataInput.readUTF();
        city = dataInput.readUTF();
        longitude = dataInput.readDouble();
        latitude = dataInput.readDouble();
    }

    public void set(String startTime, String endTime, int id, String start_loc, String end_loc, String city, double longitude, double latitude) {
        this.startTime = startTime;
        this.endTime = endTime;
        this.id = id;
        this.start_loc = start_loc;
        this.end_loc = end_loc;
        this.city = city;
        this.longitude = longitude;
        this.latitude = latitude;
    }

    @Override
    public String toString() {
        return startTime + "\t" +
                endTime + "\t" +
                id + "\t" +
                start_loc + "\t" +
                end_loc + "\t" +
                city + "\t" +
                longitude + "\t" +
                latitude;
    }

    public String getStartTime() {
        return startTime;
    }

    public void setStartTime(String startTime) {
        this.startTime = startTime;
    }

    public String getEndTime() {
        return endTime;
    }

    public void setEndTime(String endTime) {
        this.endTime = endTime;
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getStart_loc() {
        return start_loc;
    }

    public void setStart_loc(String start_loc) {
        this.start_loc = start_loc;
    }

    public String getEnd_loc() {
        return end_loc;
    }

    public void setEnd_loc(String end_loc) {
        this.end_loc = end_loc;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public double getLongitude() {
        return longitude;
    }

    public void setLongitude(double longitude) {
        this.longitude = longitude;
    }

    public double getLatitude() {
        return latitude;
    }

    public void setLatitude(double latitude) {
        this.latitude = latitude;
    }
}
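Since the shuffle both serializes and sorts these beans, it is worth a quick sanity check that write() and readFields() are symmetric. A minimal local round-trip sketch (all field values are made up for illustration; this is not part of the job):

import java.io.*;

public class RoundTripCheck {
    public static void main(String[] args) throws IOException {
        JavaBean out = new JavaBean();
        // Made-up values, just to exercise every field
        out.set("2017-07-01 00:45", "2017-07-01 00:56", 42, "A", "B", "Boston", 0.01, 0.02);

        // Serialize the way Hadoop would during the shuffle
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        out.write(new DataOutputStream(buf));

        // Deserialize into a fresh object and print it back
        JavaBean in = new JavaBean();
        in.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        System.out.println(in); // should print the same tab-separated record
    }
}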

Mapper stage

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;

public class MapTest extends Mapper<LongWritable, Text, JavaBean, NullWritable> {
    JavaBean k = new JavaBean();
    // Raw timestamps arrive as MM/dd/yyyy HH:mm; the cleaned output uses yyyy-MM-dd HH:mm
    SimpleDateFormat simpleDateFormat1 = new SimpleDateFormat("MM/dd/yyyy HH:mm");
    SimpleDateFormat simpleDateFormat2 = new SimpleDateFormat("yyyy-MM-dd HH:mm");

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // The -1 limit keeps trailing empty strings so a missing last column is still detected
        String[] datas = value.toString().split("\t", -1);
        // Requirement 1: drop any record containing an empty field or NA
        for (String str : datas) {
            if (str == null || "".equals(str) || "NA".equalsIgnoreCase(str)) return;
        }
        // Requirement 2: reformat the start and end timestamps
        String time1, time2;
        try {
            time1 = simpleDateFormat2.format(simpleDateFormat1.parse(datas[1]));
            time2 = simpleDateFormat2.format(simpleDateFormat1.parse(datas[2]));
        } catch (ParseException e) {
            return; // skip records with malformed timestamps instead of reusing stale values
        }
        // Requirement 3: longitude/latitude span = end coordinate minus start coordinate
        double longitude = Double.parseDouble(datas[8]) - Double.parseDouble(datas[7]);
        double latitude = Double.parseDouble(datas[10]) - Double.parseDouble(datas[9]);
        k.set(time1, time2, Integer.parseInt(datas[3]), datas[4], datas[5], datas[6], longitude, latitude);
        context.write(k, NullWritable.get());
    }
}
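One detail worth calling out is the -1 limit passed to split: without it, Java drops trailing empty strings, so a record whose last column is empty would slip past the empty-field filter. A quick standalone illustration:

public class SplitDemo {
    public static void main(String[] args) {
        String row = "a\tb\t"; // record whose last column is empty
        System.out.println(row.split("\t").length);     // 2 -- trailing empty string silently dropped
        System.out.println(row.split("\t", -1).length); // 3 -- empty string kept, so the filter sees it
    }
}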

Custom OutputFormat

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MyOutputFormat extends FileOutputFormat<JavaBean, NullWritable> {
    @Override
    public RecordWriter<JavaBean, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        return new MyRecordWriter(job);
    }
}
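Note on how this fits together: the driver still has to set a FileOutputFormat output directory, but with this custom OutputFormat typically only the job's _SUCCESS marker lands there; the actual records go to the per-city files opened by the RecordWriter below.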

Custom RecordWriter

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class MyRecordWriter extends RecordWriter<JavaBean, NullWritable> {
    // One writer per city, kept open for the life of the task and closed in close()
    private final Map<String, BufferedWriter> writers = new HashMap<>();

    public MyRecordWriter(TaskAttemptContext taskAttemptContext) {
    }

    @Override
    public void write(JavaBean key, NullWritable value) throws IOException, InterruptedException {
        String city = key.getCity();
        BufferedWriter bw = writers.get(city);
        if (bw == null) {
            // Open the city's file on first use; subsequent records reuse the same writer
            String path = "D:\\MP\\共享单车\\output1\\" + city + ".txt";
            bw = new BufferedWriter(new FileWriter(path, true));
            writers.put(city, bw);
        }
        bw.write(key.toString());
        bw.newLine();
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        // Close every city's writer, not just the last one used
        for (BufferedWriter bw : writers.values()) {
            bw.close();
        }
    }
}
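One design caveat: FileWriter is opened in append mode, so rerunning the job without first clearing the output1 directory will append duplicate records to the per-city files. The driver below only clears output2, the job's official output directory.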

Driver stage

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.File;

public class DriTest {
    public static void main(String[] args) throws Exception {
        // Clear the previous output directory (if any) before running the job
        File file = new File("D:\\MP\\共享单车\\output2");
        if (file.exists()) {
            delFile(file);
        }
        driver();
    }

    // Recursively delete a directory tree (used to clear the old output)
    public static void delFile(File file) {
        File[] files = file.listFiles();
        if (files != null) {
            for (File f : files) {
                delFile(f);
            }
        }
        file.delete();
    }

    public static void driver() throws Exception {
        Configuration conf = new Configuration();
//        conf.set("fs.defaultFS", "hdfs://192.168.0.155:9000"); // uncomment to run against HDFS instead of the local filesystem
        Job job = Job.getInstance(conf);

        job.setJarByClass(DriTest.class);
        job.setMapperClass(MapTest.class);

        job.setMapOutputKeyClass(JavaBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputFormatClass(MyOutputFormat.class);

        FileInputFormat.setInputPaths(job, "D:\\MP\\共享单车\\input\\dataResources.txt");
        FileOutputFormat.setOutputPath(job, new Path("D:\\MP\\共享单车\\output2"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
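The driver above runs against the local filesystem with Windows paths. For reference, a minimal sketch of the same driver pointed at HDFS: the NameNode address comes from the commented-out line above, the /bike/... paths are placeholders, and MyRecordWriter would also need rewriting to use the HDFS FileSystem API instead of a local FileWriter.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HdfsDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.0.155:9000"); // NameNode address, assumed from the line above
        Job job = Job.getInstance(conf);
        job.setJarByClass(HdfsDriver.class);
        job.setMapperClass(MapTest.class);
        job.setMapOutputKeyClass(JavaBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputFormatClass(MyOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("/bike/input/dataResources.txt")); // placeholder HDFS path
        FileOutputFormat.setOutputPath(job, new Path("/bike/output2"));                // placeholder HDFS path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}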

Results

Partitioning by city succeeded, and within each city's file the bike ids are in ascending order (result screenshots omitted).
