- 数据分析

一共有13个字段,分别是出发地、目的地、价格、节省、路线名、酒店、房间、去程航司、去程方式、去程时间、回程航司、回程方式、回程时间
数据下载地址: 下载
- 需求
删除含有空值的数据
删除重复的数据
我们假设价格在1000-3500之间为合理值,去除价格异常的数据
节省>价格为异常,去除节省异常值
酒店只保留名称,其他的多余信息删除
代码实现
自定义类
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class Map_Trip implements WritableComparable<Map_Trip> {
private String start_loc;
private String end_loc;
private String price;
private String save;
private String line;
private String hotel;
private String room;
private String start_company;
private String start_type;
private String start_time;
private String end_comany;
private String end_type;
private String end_time;
/**
 * Orders two trips by comparing their fields one at a time, in declaration
 * order (start_loc, end_loc, price, save, line, hotel, room, start_company,
 * start_type, start_time, end_comany, end_type, end_time).
 *
 * <p>Returns 0 only when every field matches, so two records compare equal
 * exactly when they are field-for-field duplicates. A constant result of 0
 * would make every record look identical to any sort or grouping step.
 * NOTE(review): how this type is used as a key is not visible in this file;
 * confirm no caller relied on all instances comparing equal.
 *
 * @param o the trip to compare against; must not be null
 * @return a negative value, zero, or a positive value as this trip sorts
 *         before, equal to, or after {@code o}
 * @throws NullPointerException if {@code o} is null
 */
@Override
public int compareTo(Map_Trip o) {
    String[] mine = {
        start_loc, end_loc, price, save, line, hotel, room,
        start_company, start_type, start_time,
        end_comany, end_type, end_time
    };
    String[] theirs = {
        o.start_loc, o.end_loc, o.price, o.save, o.line, o.hotel, o.room,
        o.start_company, o.start_type, o.start_time,
        o.end_comany, o.end_type, o.end_time
    };
    for (int i = 0; i < mine.length; i++) {
        int order = compareNullableField(mine[i], theirs[i]);
        if (order != 0) {
            // First differing field decides the ordering.
            return order;
        }
    }
    return 0;
}

/** Compares two possibly-null strings; null sorts before any non-null value. */
private static int compareNullableField(String left, String right) {
    if (left == null) {
        return right == null ? 0 : -1;
    }
    if (right == null) {
        return 1;
    }
    return left.compareTo(right);
}
/**
 * Serializes this trip by writing each of its thirteen string fields with
 * {@link DataOutput#writeUTF(String)}.
 *
 * <p>The fields are emitted in declaration order; {@code readFields} reads
 * them back in the same order, so the two sequences must stay in step.
 *
 * @param dataOutput the sink the fields are written to
 * @throws IOException if the underlying output fails
 */
@Override
public void write(DataOutput dataOutput) throws IOException {
    // Wire order: keep identical to the read order in readFields.
    final String[] fieldsInWireOrder = {
        start_loc,
        end_loc,
        price,
        save,
        line,
        hotel,
        room,
        start_company,
        start_type,
        start_time,
        end_comany,
        end_type,
        end_time
    };
    for (String value : fieldsInWireOrder) {
        dataOutput.writeUTF(value);
    }
}
/**
 * Restores this trip from a stream by reading thirteen strings with
 * {@link DataInput#readUTF()} and assigning them to the fields.
 *
 * <p>Values are consumed in declaration order, mirroring the order in which
 * {@code write} emits them. Each field is assigned as soon as it is read.
 *
 * @param dataInput the source the fields are read from
 * @throws IOException if the underlying input fails or ends early
 */
@Override
public void readFields(DataInput dataInput) throws IOException {
    final DataInput in = dataInput;

    // Locations and pricing.
    this.start_loc = in.readUTF();
    this.end_loc = in.readUTF();
    this.price = in.readUTF();
    this.save = in.readUTF();

    // Route and accommodation.
    this.line = in.readUTF();
    this.hotel = in.readUTF();
    this.room = in.readUTF();

    // Outbound leg.
    this.start_company = in.readUTF();
    this.start_type = in.readUTF();
    this.start_time = in.readUTF();

    // Return leg.
    this.end_comany = in.readUTF();
    this.end_type = in.readUTF();
    this.end_time = in.readUTF();
}
public void set(String start_loc, String end_loc, String price, String save, String line, String hotel, String room, String start_company, String start_type, String start_time, String end_comany, String end_type, String end_time) {
this.start_loc = start_loc;
this.end_loc