Cleaning Bike-Share Data with MapReduce
Data
The fields of each record are, in order: start time, end time, bike ID, origin, destination, city, start longitude, start latitude, end longitude, end latitude. (As indexed by the Mapper below, these occupy columns 1 through 10; column 0 is not used.)
- Requirements
Drop any record that contains an empty field or an NA value.
Convert the timestamps to a uniform format such as 2017-07-01 00:45 (a quick sketch of this conversion follows the list).
Compute the longitude and latitude span covered by each trip.
Group the records by city into separate output files; within each file, sort by bike ID in ascending order.
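To illustrate the timestamp requirement, here is a minimal standalone sketch of the conversion the Mapper performs later; the class name and the sample input string are assumptions (the sample merely follows the MM/dd/yyyy HH:mm layout of the source data, it is not a line from the actual dataset).

import java.text.ParseException;
import java.text.SimpleDateFormat;

public class TimeFormatDemo {
    public static void main(String[] args) throws ParseException {
        // Source data uses MM/dd/yyyy HH:mm; the cleaned output uses yyyy-MM-dd HH:mm.
        SimpleDateFormat in = new SimpleDateFormat("MM/dd/yyyy HH:mm");
        SimpleDateFormat out = new SimpleDateFormat("yyyy-MM-dd HH:mm");
        // "07/01/2017 00:45" is an assumed sample value.
        System.out.println(out.format(in.parse("07/01/2017 00:45"))); // 2017-07-01 00:45
    }
}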
Implementation
Custom Writable class
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class JavaBean implements WritableComparable<JavaBean> {
    private String startTime;
    private String endTime;
    private int id;
    private String start_loc;
    private String end_loc;
    private String city;
    private double longitude;
    private double latitude;

    // Ascending order by bike ID; the shuffle phase sorts map output keys with this.
    @Override
    public int compareTo(JavaBean o) {
        return Integer.compare(this.id, o.id);
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(startTime);
        dataOutput.writeUTF(endTime);
        dataOutput.writeInt(id);
        dataOutput.writeUTF(start_loc);
        dataOutput.writeUTF(end_loc);
        dataOutput.writeUTF(city);
        dataOutput.writeDouble(longitude);
        dataOutput.writeDouble(latitude);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // Fields must be read in exactly the order they were written.
        startTime = dataInput.readUTF();
        endTime = dataInput.readUTF();
        id = dataInput.readInt();
        start_loc = dataInput.readUTF();
        end_loc = dataInput.readUTF();
        city = dataInput.readUTF();
        longitude = dataInput.readDouble();
        latitude = dataInput.readDouble();
    }

    public void set(String startTime, String endTime, int id, String start_loc,
                    String end_loc, String city, double longitude, double latitude) {
        this.startTime = startTime;
        this.endTime = endTime;
        this.id = id;
        this.start_loc = start_loc;
        this.end_loc = end_loc;
        this.city = city;
        this.longitude = longitude;
        this.latitude = latitude;
    }

    @Override
    public String toString() {
        return startTime + "\t" + endTime + "\t" + id + "\t" + start_loc + "\t"
                + end_loc + "\t" + city + "\t" + longitude + "\t" + latitude;
    }

    public String getStartTime() { return startTime; }

    public void setStartTime(String startTime) { this.startTime = startTime; }

    public String getEndTime() { return endTime; }

    public void setEndTime(String endTime) { this.endTime = endTime; }

    public int getId() { return id; }

    public void setId(int id) { this.id = id; }

    public String getStart_loc() { return start_loc; }

    public void setStart_loc(String start_loc) { this.start_loc = start_loc; }

    public String getEnd_loc() { return end_loc; }

    public void setEnd_loc(String end_loc) { this.end_loc = end_loc; }

    public String getCity() { return city; }

    public void setCity(String city) { this.city = city; }

    public double getLongitude() { return longitude; }

    public void setLongitude(double longitude) { this.longitude = longitude; }

    public double getLatitude() { return latitude; }

    public void setLatitude(double latitude) { this.latitude = latitude; }
}
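A note on why this class drives the sort: MapReduce sorts map output keys during the shuffle using compareTo, and since this job sets no Reducer, the default identity Reducer passes the sorted keys straight through. That is what produces the ascending bike-ID order inside each city file. A quick standalone check, with a hypothetical class name and made-up values:

public class CompareDemo {
    public static void main(String[] args) {
        // Hypothetical sample values, only to exercise the ordering.
        JavaBean a = new JavaBean();
        a.set("2017-07-01 00:45", "2017-07-01 01:00", 2, "A", "B", "CityX", 0.1, 0.2);
        JavaBean b = new JavaBean();
        b.set("2017-07-01 00:50", "2017-07-01 01:10", 7, "C", "D", "CityX", 0.3, 0.4);
        System.out.println(a.compareTo(b) < 0); // true: ID 2 sorts before ID 7
    }
}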
Mapper stage
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class MapTest extends Mapper<LongWritable, Text, JavaBean, NullWritable> {
    JavaBean k = new JavaBean();
    // Source timestamps use MM/dd/yyyy HH:mm; the cleaned output uses yyyy-MM-dd HH:mm.
    SimpleDateFormat simpleDateFormat1 = new SimpleDateFormat("MM/dd/yyyy HH:mm");
    SimpleDateFormat simpleDateFormat2 = new SimpleDateFormat("yyyy-MM-dd HH:mm");

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split with limit -1 so trailing empty fields are preserved and checked.
        String[] datas = value.toString().split("\t", -1);
        // Drop the whole record if any field is empty or NA.
        for (String str : datas) {
            if (str == null || str.isEmpty() || "NA".equalsIgnoreCase(str)) {
                return;
            }
        }
        String time1;
        String time2;
        try {
            Date date1 = simpleDateFormat1.parse(datas[1]);
            time1 = simpleDateFormat2.format(date1);
            Date date2 = simpleDateFormat1.parse(datas[2]);
            time2 = simpleDateFormat2.format(date2);
        } catch (ParseException e) {
            // A malformed timestamp also disqualifies the record,
            // instead of emitting stale values from a previous line.
            return;
        }
        // Span of the trip, assuming the column order stated in the schema above:
        // [7] start longitude, [8] start latitude, [9] end longitude, [10] end latitude.
        double longitude = Double.parseDouble(datas[9]) - Double.parseDouble(datas[7]);
        double latitude = Double.parseDouble(datas[10]) - Double.parseDouble(datas[8]);
        k.set(time1, time2, Integer.parseInt(datas[3]), datas[4], datas[5], datas[6], longitude, latitude);
        context.write(k, NullWritable.get());
    }
}
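One detail worth calling out: split("\t", -1) keeps trailing empty fields, so blank columns at the end of a line are still seen by the empty/NA check. A tiny demo of the difference the limit argument makes (class name and sample line are assumptions):

public class SplitDemo {
    public static void main(String[] args) {
        String line = "a\tb\t\t"; // assumed sample line with two empty trailing fields
        System.out.println(line.split("\t").length);     // 2 -- trailing empties dropped
        System.out.println(line.split("\t", -1).length); // 4 -- trailing empties kept
    }
}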
Custom OutputFormat
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MyOutputFormat extends FileOutputFormat<JavaBean, NullWritable> {
    @Override
    public RecordWriter<JavaBean, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        return new MyRecordWriter(job);
    }
}
Custom RecordWriter
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class MyRecordWriter extends RecordWriter<JavaBean, NullWritable> {
    // One writer per city, so each city's records land in their own file and
    // file handles are reused instead of being reopened for every record.
    private final Map<String, BufferedWriter> writers = new HashMap<>();

    public MyRecordWriter(TaskAttemptContext taskAttemptContext) {
    }

    @Override
    public void write(JavaBean key, NullWritable value) throws IOException, InterruptedException {
        String city = key.getCity();
        BufferedWriter bw = writers.get(city);
        if (bw == null) {
            // Opened in append mode, keyed by city name.
            bw = new BufferedWriter(new FileWriter("D:\\MP\\共享单车\\output1\\" + city + ".txt", true));
            writers.put(city, bw);
        }
        bw.write(key.toString());
        bw.newLine();
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        for (BufferedWriter bw : writers.values()) {
            bw.close();
        }
    }
}
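One caveat with this design: the per-city files are opened in append mode under output1, while the Driver below only clears output2 (the directory handed to FileOutputFormat). Rerunning the job without deleting the old city files by hand will therefore append duplicate records to them.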
Driver stage
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.File;

public class DriTest {
    public static void main(String[] args) throws Exception {
        // FileOutputFormat refuses to run if the output directory already exists.
        File file = new File("D:\\MP\\共享单车\\output2");
        if (file.exists()) {
            delFile(file);
        }
        driver();
    }

    // Recursively delete a directory and everything in it.
    public static void delFile(File file) {
        File[] files = file.listFiles();
        if (files != null) {
            for (File f : files) {
                delFile(f);
            }
        }
        file.delete();
    }

    public static void driver() throws Exception {
        Configuration conf = new Configuration();
        // conf.set("fs.defaultFS", "hdfs://192.168.0.155:9000");
        Job job = Job.getInstance(conf);
        job.setJarByClass(DriTest.class);
        job.setMapperClass(MapTest.class);
        job.setMapOutputKeyClass(JavaBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputFormatClass(MyOutputFormat.class);
        FileInputFormat.setInputPaths(job, "D:\\MP\\共享单车\\input\\dataResources.txt");
        FileOutputFormat.setOutputPath(job, new Path("D:\\MP\\共享单车\\output2"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
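The commented-out configuration line hints at running the same job against a cluster. Below is a hedged sketch of that variant: the class name and HDFS paths are hypothetical, the NameNode address is carried over from the comment (the correct key is fs.defaultFS, not fs.default), and MyRecordWriter above writes to a local Windows path, so it would also need to be rewritten against Hadoop's FileSystem API before this could actually run on a cluster.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HdfsDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumed NameNode address, taken from the commented-out line above.
        conf.set("fs.defaultFS", "hdfs://192.168.0.155:9000");
        Job job = Job.getInstance(conf);
        job.setJarByClass(HdfsDriverSketch.class);
        job.setMapperClass(MapTest.class);
        job.setMapOutputKeyClass(JavaBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputFormatClass(MyOutputFormat.class);
        // Hypothetical HDFS paths; adjust to wherever the data actually lives.
        FileInputFormat.setInputPaths(job, new Path("/bike/input"));
        FileOutputFormat.setOutputPath(job, new Path("/bike/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}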
Results
Grouping by city succeeds: each city gets its own output file.
Within each file, records are sorted by bike ID in ascending order.