网约车大数据综合项目——基于MapReduce的数据清洗
第1关:网约车撤销订单数据清洗
package traffic.step1.mapreduce;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
public class DBHelper {
private static final String driver = "com.mysql.jdbc.Driver";
private static final String url = "jdbc:mysql://127.0.0.1:3306/trafficdb?useUnicode=true&characterEncoding=UTF-8";
private static final String username = "root";
private static final String password = "123123";
private static Connection conn = null;
static {
try {
Class.forName(driver);
} catch (Exception ex) {
ex.printStackTrace();
}
}
public static Connection getConnection() {
if (conn == null) {
try {
conn = DriverManager.getConnection(url, username, password);
} catch (SQLException e) {
e.printStackTrace();
}
return conn;
}
return conn;
}
}
package traffic.step1.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map;
public class TrafficMap extends Mapper <LongWritable, Text,Text, Text> {
Map<String, String> addressMap = new HashMap<String, String>();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Connection connection = DBHelper.getConnection();
try {
Statement statement = connection.createStatement();
String sql = "select * from t_address";
ResultSet resultSet = statement.executeQuery(sql);
while (resultSet.next()) {
String address_code = resultSet.getString(1);
String address_name = resultSet.getString(2);
addressMap.put(address_code, address_name);
}
} catch (SQLException e) {
e.printStackTrace();
}
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] values = value.toString().split(",",8);
boolean flag=true;
for (String val : values){
if (val.equals("")){
flag=false;
}
}
if (values.length!=8){
flag=false;
}
if (!values[3].startsWith("20190307")||!values[4].startsWith("20190307")){
flag=false;
}
if (flag){
if (values[7].equals("null")){
values[7]="未知";
}
DateFormat df1 = new SimpleDateFormat("yyyyMMddHHmmss");
DateFormat df2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
try {
values[3]=df2.format(df1.parse(values[3]));
values[4]=df2.format(df1.parse(values[4]));
} catch (Exception e) {
System.out.println(e.getMessage());
}
String districtname="";
if (addressMap.containsKey(values[1])){
districtname=addressMap.get(values[1].trim());
}else {
districtname="未知";
}
String result = "";
for (int i=0;i<8;i++){
if (i==7){
result = result + values[i];
}else{
if (i==1){
result = result + values[i] + "|"+districtname + "|";
}else{
result = result + values[i] + "|";
}
}
}
context.write(new Text(values[2]),new Text(result));
}
}
}
package traffic.step1.mapreduce;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class TrafficReduce extends Reducer<Text, Text,NullWritable,Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
int num=0;
String result="";
for (Text val:values){
if (num==0){
result=val.toString();
num++;
context.write(NullWritable.get(),new Text(result));
}
}
}
}
package traffic.step1.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TrafficJob {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(TrafficJob.class);
job.setMapperClass(TrafficMap.class);
job.setReducerClass(TrafficReduce.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
String outputpath = "/root/files";
Path path= new Path(outputpath);
FileSystem fileSystem =path.getFileSystem(conf);
if (fileSystem.exists(path)){
fileSystem.delete(path,true);
}
FileInputFormat.addInputPath(job, new Path("/data/workspace/myshixun/data/ProvOrderCancel/*/"));
FileOutputFormat.setOutputPath(job,path);
job.waitForCompletion(true);
}
}
第2关:网约车成功订单数据清洗
package traffic.step2;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
public class DBHelper {
private static final String driver = "com.mysql.jdbc.Driver";
private static final String url = "jdbc:mysql://localhost:3306/trafficdb?useUnicode=true&characterEncoding=UTF-8";
private static final String username = "root";
private static final String password = "123123";
private static Connection conn = null;
static {
try {
Class.forName(driver);
} catch (Exception ex) {
ex.printStackTrace();
}
}
public static Connection getConnection() {
if (conn == null) {
try {
conn = DriverManager.getConnection(url, username, password);
} catch (SQLException e) {
e.printStackTrace();
}
return conn;
}
return conn;
}
}
package traffic.step2;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map;
public class TrafficMap extends Mapper <LongWritable, Text,Text, Text> {
Map<String, String> addressMap = new HashMap<String, String>();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Connection connection = DBHelper.getConnection();
try {
Statement statement = connection.createStatement();
String sql = "select * from t_address";
ResultSet resultSet = statement.executeQuery(sql);
while (resultSet.next()) {
String address_code = resultSet.getString(1);
String address_name = resultSet.getString(2);
addressMap.put(address_code, address_name);
}
} catch (SQLException e) {
e.printStackTrace();
}
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] val = value.toString().split(",",14);
String vals="";
for(int i=0;i<val.length;i++){
if (i==5||i==12||i==13){
}else if (i==11){
vals = vals + val[i];
}else {
vals = vals + val[i] + ",";
}
}
String[] values= vals.split(",",11);
boolean flag=true;
if (values.length!=11){
flag=false;
}
for (String va : values){
if (!flag){break;}
if (va.equals("")){
flag=false;
}
}
try {
values[6]=values[6].substring(0,3)+"."+values[6].substring(3,values[6].trim().length());
values[7]=values[7].substring(0,2)+"."+values[7].substring(2,values[7].trim().length());
values[9]=values[9].substring(0,3)+"."+values[9].substring(3,values[9].trim().length());
values[10]=values[10].substring(0,2)+"."+values[10].substring(2,values[10].trim().length());
} catch (Exception e) {
flag=false;
}
if (!values[3].startsWith("20190307")||!values[4].startsWith("20190307")){
flag=false;
}
if (flag){
DateFormat df1 = new SimpleDateFormat("yyyyMMddHHmmss");
DateFormat df2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
try {
values[3]=df2.format(df1.parse(values[3]));
values[4]=df2.format(df1.parse(values[4]));
} catch (Exception e) {
System.out.println(e.getMessage());
}
String districtname="";
if (addressMap.containsKey(values[1])){
districtname=addressMap.get(values[1].trim());
}else {
districtname="未知";
}
String result = "";
for (int i=0;i<11;i++){
if (i==10){
result = result + values[i];
}else{
if (i==1){
result = result + values[i] + "\t"+districtname + "\t";
}else {
result = result + values[i] + "\t";
}
}
}
context.write(new Text(values[2]),new Text(result));
}
}
}
package traffic.step2;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class TrafficReduce extends Reducer<Text, Text,NullWritable,Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
int num=0;
String result="";
for (Text val:values){
if (num==0){
result=val.toString();
num++;
context.write(NullWritable.get(),new Text(result));
}
}
}
}
package traffic.step2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TrafficJob {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(TrafficJob.class);
job.setMapperClass(TrafficMap.class);
job.setReducerClass(TrafficReduce.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
String outputpath = "/root/files1";
Path path= new Path(outputpath);
FileSystem fileSystem =path.getFileSystem(conf);
if (fileSystem.exists(path)){
fileSystem.delete(path,true);
}
FileInputFormat.addInputPath(job, new Path("/data/workspace/myshixun/data/ProvOrderCreate/*/"));
FileOutputFormat.setOutputPath(job,path);
job.waitForCompletion(true);
}
}