MapReduce----电信数据清洗

数据解析及题目分析

数据解析


数据一

18620192711,15733218050,1506628174,1506628265,650000,810000
18641241020,15733218050,1509757276,1509757464,330000,620000
15778423030,15614201525,1495290451,1495290923,370000,420000
13341109505,15151889601,1492661762,1492662200,330000,460000
13341109505,13666666666,1470111026,1470111396,360000,230000
15032293356,13799999999,1495937181,1495937360,500000,630000
15733218050,13341109505,1452601976,1452602401,620000,530000
13269361119,13269361119,1487640690,1487641023,450000,430000
13799999999,15338595369,1511928814,1511929111,540000,230000
15733218050,15778423030,1542457633,1542457678,450000,530000
13341109505,17731088562,1484364844,1484365342,460000,360000
18332562075,15778423030,1522426275,1522426473,140000,120000
13560190665,18301589432,1485648596,1485648859,620000,820000
15733218050,13520404983,1538992531,1538992605,130000,150000
15778423030,13566666666,1484008721,1484009210,810000,330000
13566666666,17731088562,1541812913,1541813214,220000,360000
15778423030,15733218050,1464198621,1464198803,630000,340000
15151889601,13341109505,1467441052,1467441538,640000,440000
18620192711,13666666666,1510997876,1510998253,450000,610000
13341109505,18641241020,1509074946,1509075201,710000,310000
17731088562,13341109505,1471571270,1471571706,430000,630000
13520404983,13560190665,1476626194,1476626683,500000,440000
15338595369,13341109505,1523996031,1523996059,420000,460000
15151889601,13341109505,1489658199,1489658394,330000,500000
13560190665,15338595369,1510890681,1510891129,410000,520000
15733218050,13566666666,1503498540,1503498726,420000,310000
17731088562,13560190665,1470571255,1470571708,540000,330000
15338595369,15614201525,1496767879,1496768364,520000,500000
17731088562,15778423030,1494602567,1494602784,500000,420000
15778423030,18641241020,1517445007,1517445358,450000,530000
13566666666,17731088562,1464697765,1464697894,360000,620000
15778423030,13799999999,1525543218,1525543493,500000,820000
13341109505,13520404983,1521861238,1521861421,500000,130000
13566666666,13560190665,1513918160,1513918538,340000,210000
15032293356,18620192711,1485688388,1485688537,540000,530000
13799999999,13341109505,1531196363,1531196438,230000,320000
15338595369,15151889601,1512125514,1512125978,540000,810000
18332562075,13560190665,1523311951,1523312239,650000,410000
15778423030,15032293356,1467953782,1467954054,810000,540000
15151889601,15733218050,1530848147,1530848231,310000,150000
13269361119,18301589432,1541271874,1541272273,310000,310000
15032293356,15338595369,1520833915,1520834201,450000,360000
15778423030,13269361119,1452817391,1452817596,820000,410000
13520404983,18332562075,1474563316,1474563593,710000,540000
18301589432,15778423030,1473596284,1473596528,620000,310000
15732648446,15151889601,1535584645,1535585117,530000,310000
18301589432,13269361119,1511910316,1511910341,340000,320000
13560190665,18641241020,1533379659,1533379717,120000,710000
15338595369,18332562075,1474152847,1474153092,330000,500000
13520404983,17731088562,1504907456,1504907617,820000,510000
15732648446,18301589432,1521692836,1521692977,220000,370000
15032293356,15614201525,1471445293,1471445756,360000,530000
18641241020,15778423030,1517192728,1517193050,210000,610000
17731088562,15733218050,1493420249,1493420555,370000,820000
18620192711,13799999999,1477952709,1477953088,310000,140000
13666666666,13799999999,1541066076,1541066541,230000,640000
13269361119,17731088562,1540060141,1540060511,150000,540000
18332562075,13799999999,1489772390,1489772817,540000,710000
13799999999,15732648446,1503882021,1503882332,530000,520000
13566666666,15614201525,1504983084,1504983241,820000,140000
18641241020,15032293356,1463447030,1463447080,330000,640000
18301589432,13566666666,1493646451,1493646796,310000,510000
15732648446,15032293356,1537185125,1537185619,430000,810000
15338595369,13341109505,1493411872,1493411891,370000,150000
15778423030,17731088562,1540631847,1540632271,320000,500000
13666666666,15614201525,1545200734,1545200959,360000,640000
15032293356,13799999999,1455000970,1455001084,460000,650000
18641241020,18620192711,1529968498,1529968626,410000,510000
17731088562,15732648446,1455361378,1455361505,440000,650000
18301589432,13666666666,1518564232,1518564421,210000,640000
15733218050,18620192711,1515672794,1515673149,360000,360000
13520404983,18620192711,1521620546,1521620913,820000,370000
18332562075,18641241020,1498131159,1498131300,820000,230000
13666666666,18301589432,1491354142,1491354544,220000,710000
18301589432,15614201525,1511731560,1511732015,810000,620000
13269361119,13666666666,1539065031,1539065096,810000,810000
15778423030,18641241020,1518364528,1518364995,130000,610000
15733218050,15032293356,1491974898,1491975316,340000,810000
13269361119,15733218050,1543514850,1543514946,410000,460000
13341109505,13666666666,1482223100,1482223577,220000,410000
15338595369,13341109505,1495958992,1495959292,330000,420000
13341109505,18641241020,1511010003,1511010292,540000,620000
18620192711,13269361119,1462453298,1462453559,320000,360000
13666666666,13799999999,1518047527,1518047967,640000,420000
13341109505,13666666666,1474872886,1474872907,360000,510000
13666666666,18641241020,1473575493,1473575663,150000,520000
15151889601,15732648446,1509418483,1509418891,510000,540000
13560190665,13520404983,1467696946,1467697103,150000,460000
13520404983,15614201525,1510958686,1510959064,320000,610000
15778423030,15614201525,1470012457,1470012660,210000,210000
15778423030,17731088562,1542680029,1542680382,630000,520000
18332562075,15338595369,1453896030,1453896522,640000,370000
15032293356,18620192711,1488286898,1488287248,530000,150000
18641241020,15733218050,1489804133,1489804185,150000,630000
15733218050,13666666666,1506782751,1506782854,220000,500000
13520404983,17731088562,1487421622,1487421784,230000,330000
15151889601,13269361119,1538113862,1538113902,370000,630000
15778423030,17731088562,1466691118,1466691412,540000,530000
15032293356,13520404983,1521151509,1521151701,520000,430000
15614201525,13666666666,1464083166,1464083352,330000,650000

字段解析:呼叫者手机号,接听者手机号,通话开始时间戳(单位:秒),通话结束时间戳(单位:秒),呼叫者地址省份编码,接听者地址省份编码


数据二

1,110000,北京市
2,120000,天津市
3,130000,河北省
4,140000,山西省
5,150000,内蒙古自治区
6,210000,辽宁省
7,220000,吉林省
8,230000,黑龙江省
9,310000,上海市
10,320000,江苏省
11,330000,浙江省
12,340000,安徽省
13,350000,福建省
14,360000,江西省
15,370000,山东省
16,410000,河南省
17,420000,湖北省
18,430000,湖南省
19,440000,广东省
20,450000,广西壮族自治区
21,460000,海南省
22,500000,重庆市
23,510000,四川省
24,520000,贵州省
25,530000,云南省
26,540000,西藏自治区
27,610000,陕西省
28,620000,甘肃省
29,630000,青海省
30,640000,宁夏回族自治区
31,650000,新疆维吾尔自治区
32,710000,台湾省
33,810000,香港特别行政区
34,820000,澳门特别行政区

字段解析:地址id,省份编码,省份名称


数据三

7,18000696806,赵贺彪
8,15151889601,张倩
9,13269361119,王世昌
10,15032293356,张涛
11,17731088562,张阳
12,15338595369,李进全
13,15733218050,杜泽文
14,15614201525,任宗阳
15,15778423030,梁鹏
16,18641241020,郭美彤
17,15732648446,刘飞飞
18,13341109505,段光星
19,13560190665,唐会华
20,18301589432,杨力谋
21,13520404983,温海英
22,18332562075,朱尚宽
23,18620192711,刘能宗
24,13566666666,刘柳
25,13666666666,邓二
26,13799999999,菜中路

字段解析:电话ID,电话号码,姓名


题目及分析


  1. 将电话号码替换成人名
  2. 将拨打、接听电话的时间戳转换成日期
  3. 求出电话的通话时间,以秒做单位
  4. 将省份编码替换成省份名称
  5. 最后数据的样例:
邓二,张倩,13666666666,15151889601,2018-03-29 10:58:12,2018-03-29 10:58:42,30,黑龙江省,上海市

  • 需求一和需求四可以将数据二和数据三缓存到内存里,然后进行替换操作
  • 需求二是简单的时间类型转换:原始时间戳以秒为单位,格式化成日期之前需要先乘以1000换算成毫秒
  • 需求三用结束时间戳减去开始时间戳,即可得到以秒为单位的通话时长

代码实现

自定义类

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Hadoop key type holding one cleaned call record: both parties' names and
 * phone numbers, the formatted start/end times, the call duration and the
 * two province names.
 *
 * <p>Records are ordered field by field, starting with the start time, so the
 * shuffle sorts them deterministically and only truly identical records end
 * up in the same reduce group. {@link #equals(Object)} and {@link #hashCode()}
 * are consistent with {@link #compareTo(Data)}, which keeps hash-based
 * partitioning stable across JVMs.
 *
 * <p>A {@code null} field is serialized as the empty string, so comparison,
 * equality and hashing also treat {@code null} and {@code ""} as the same
 * value; this keeps a record equal to its own deserialized copy.
 */
public class Data implements WritableComparable<Data> {
    private String name_A;
    private String name_B;
    private String phoneNum_A;
    private String phoneNum_B;
    private String startTime;
    private String endTime;
    private String phoneLong;
    private String location_A;
    private String location_B;

    /**
     * Orders records by start time, end time, caller number, callee number and
     * then the remaining fields.
     *
     * @param o the record to compare against
     * @return a negative, zero or positive value as this record sorts before,
     *         equal to or after {@code o}
     */
    @Override
    public int compareTo(Data o) {
        String[] mine = comparisonFields();
        String[] theirs = o.comparisonFields();
        for (int i = 0; i < mine.length; i++) {
            int result = mine[i].compareTo(theirs[i]);
            if (result != 0) {
                return result;
            }
        }
        return 0;
    }

    /**
     * Two records are equal when every field matches (with {@code null}
     * treated as the empty string).
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Data)) {
            return false;
        }
        return compareTo((Data) obj) == 0;
    }

    /** Hash derived from the same normalized fields that {@link #equals(Object)} compares. */
    @Override
    public int hashCode() {
        return java.util.Arrays.hashCode(comparisonFields());
    }

    /**
     * Serializes all nine fields, in declaration order, as UTF strings.
     *
     * @param dataOutput the sink to write to
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        // writeUTF(null) throws NullPointerException, so unset fields are written as "".
        dataOutput.writeUTF(nullToEmpty(name_A));
        dataOutput.writeUTF(nullToEmpty(name_B));
        dataOutput.writeUTF(nullToEmpty(phoneNum_A));
        dataOutput.writeUTF(nullToEmpty(phoneNum_B));
        dataOutput.writeUTF(nullToEmpty(startTime));
        dataOutput.writeUTF(nullToEmpty(endTime));
        dataOutput.writeUTF(nullToEmpty(phoneLong));
        dataOutput.writeUTF(nullToEmpty(location_A));
        dataOutput.writeUTF(nullToEmpty(location_B));
    }

    /**
     * Restores all nine fields in the same order {@link #write(DataOutput)} emits them.
     *
     * @param dataInput the source to read from
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        name_A = dataInput.readUTF();
        name_B = dataInput.readUTF();
        phoneNum_A = dataInput.readUTF();
        phoneNum_B = dataInput.readUTF();
        startTime = dataInput.readUTF();
        endTime = dataInput.readUTF();
        phoneLong = dataInput.readUTF();
        location_A = dataInput.readUTF();
        location_B = dataInput.readUTF();
    }

    /** Replaces every field of this record in a single call. */
    public void set(String name_A, String name_B, String phoneNum_A, String phoneNum_B, String startTime, String endTime, String phoneLong, String location_A, String location_B) {
        this.name_A = name_A;
        this.name_B = name_B;
        this.phoneNum_A = phoneNum_A;
        this.phoneNum_B = phoneNum_B;
        this.startTime = startTime;
        this.endTime = endTime;
        this.phoneLong = phoneLong;
        this.location_A = location_A;
        this.location_B = location_B;
    }

    /** Renders the record as one comma-separated line, in field declaration order. */
    @Override
    public String toString() {
        return name_A + "," +
                name_B + "," +
                phoneNum_A + "," +
                phoneNum_B + "," +
                startTime + "," +
                endTime + "," +
                phoneLong + "," +
                location_A + "," +
                location_B
                ;
    }

    /** Fields in comparison priority order, with {@code null} normalized to "". */
    private String[] comparisonFields() {
        return new String[] {
                nullToEmpty(startTime),
                nullToEmpty(endTime),
                nullToEmpty(phoneNum_A),
                nullToEmpty(phoneNum_B),
                nullToEmpty(name_A),
                nullToEmpty(name_B),
                nullToEmpty(phoneLong),
                nullToEmpty(location_A),
                nullToEmpty(location_B)
        };
    }

    /** Maps {@code null} to the empty string and returns any other value unchanged. */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }

    public String getName_A() {
        return name_A;
    }

    public void setName_A(String name_A) {
        this.name_A = name_A;
    }

    public String getName_B() {
        return name_B;
    }

    public void setName_B(String name_B) {
        this.name_B = name_B;
    }

    public String getPhoneNum_A() {
        return phoneNum_A;
    }

    public void setPhoneNum_A(String phoneNum_A) {
        this.phoneNum_A = phoneNum_A;
    }

    public String getPhoneNum_B() {
        return phoneNum_B;
    }

    public void setPhoneNum_B(String phoneNum_B) {
        this.phoneNum_B = phoneNum_B;
    }

    public String getStartTime() {
        return startTime;
    }

    public void setStartTime(String startTime) {
        this.startTime = startTime;
    }

    public String getEndTime() {
        return endTime;
    }

    public void setEndTime(String endTime) {
        this.endTime = endTime;
    }

    public String getPhoneLong() {
        return phoneLong;
    }

    public void setPhoneLong(String phoneLong) {
        this.phoneLong = phoneLong;
    }

    public String getLocation_A() {
        return location_A;
    }

    public void setLocation_A(String location_A) {
        this.location_A = location_A;
    }

    public String getLocation_B() {
        return location_B;
    }

    public void setLocation_B(String location_B) {
        this.location_B = location_B;
    }
}

Map阶段

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map;

/**
 * Mapper that turns one raw call record
 * ({@code caller,callee,startSeconds,endSeconds,callerProvinceCode,calleeProvinceCode})
 * into a cleaned {@link Data} key.
 *
 * <p>During {@link #setup(Context)} the two cache files registered on the job
 * are loaded into memory: the first maps phone number to user name, the second
 * maps province code to province name. Both files are read as comma-separated
 * lines whose second column is the key and third column is the value.
 */
public class MapTest extends Mapper<LongWritable, Text, Data, NullWritable> {
    /** Number of comma-separated columns a call record must have. */
    private static final int RECORD_FIELDS = 6;
    /** Number of comma-separated columns a lookup-file line must have. */
    private static final int LOOKUP_FIELDS = 3;

    /** Reused output key; Hadoop serializes it on every {@code context.write}. */
    private final Data k = new Data();
    /** Phone number -> user name. */
    private final Map<String, String> userName = new HashMap<String, String>();
    /** Province code -> province name. */
    private final Map<String, String> location = new HashMap<String, String>();
    /** Created once per mapper instance instead of once per record. */
    private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    /**
     * Loads the user and province lookup tables from the job's cache files.
     *
     * @throws IOException if fewer than two cache files are registered or a
     *                     file cannot be read
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        URI[] uris = context.getCacheFiles();
        if (uris == null || uris.length < 2) {
            throw new IOException("Expected 2 cache files (user list, province list) but found "
                    + (uris == null ? 0 : uris.length));
        }
        // Cache the user-name table, then the province table.
        loadLookup(new File(uris[0]), userName);
        loadLookup(new File(uris[1]), location);
    }

    /**
     * Reads a comma-separated lookup file into {@code target}, using column 1
     * as the key and column 2 as the value. Lines with fewer than three
     * columns (for example a trailing blank line) are ignored.
     */
    private static void loadLookup(File file, Map<String, String> target) throws IOException {
        // NOTE(review): FileReader decodes with the platform default charset; confirm the
        // lookup files are stored in that encoding, otherwise the Chinese names will be garbled.
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split(",");
                if (fields.length < LOOKUP_FIELDS) {
                    continue;
                }
                target.put(fields[1], fields[2]);
            }
        }
    }

    /**
     * Cleans one call record and emits it as the output key.
     *
     * <p>Records with fewer than six columns are skipped and counted under the
     * {@code MapTest/MALFORMED_RECORDS} counter. A phone number or province
     * code that is missing from the lookup tables is emitted unchanged rather
     * than as {@code null}.
     *
     * @throws NumberFormatException if either timestamp column is not a number
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] data = value.toString().split(",");
        if (data.length < RECORD_FIELDS) {
            context.getCounter("MapTest", "MALFORMED_RECORDS").increment(1);
            return;
        }

        // Replace phone numbers with user names.
        String name_A = lookup(userName, data[0]);
        String name_B = lookup(userName, data[1]);

        // The input timestamps are epoch seconds, while the formatter interprets a
        // number as epoch milliseconds, so scale by 1000 before formatting.
        long startSeconds = Long.parseLong(data[2]);
        long endSeconds = Long.parseLong(data[3]);
        String startTime = sdf.format(Long.valueOf(startSeconds * 1000L));
        String endTime = sdf.format(Long.valueOf(endSeconds * 1000L));

        // Call duration in seconds.
        // NOTE(review): the sample output in the task description shows a bare number
        // without the unit suffix; the suffix is kept here to preserve the existing format.
        String time = (endSeconds - startSeconds) + "秒";

        // Replace province codes with province names.
        String loc_A = lookup(location, data[4]);
        String loc_B = lookup(location, data[5]);

        // Emit the cleaned record.
        k.set(name_A, name_B, data[0], data[1], startTime, endTime, time, loc_A, loc_B);
        context.write(k, NullWritable.get());
    }

    /** Returns the mapped value for {@code key}, or {@code key} itself when there is no mapping. */
    private static String lookup(Map<String, String> table, String key) {
        String mapped = table.get(key);
        return mapped != null ? mapped : key;
    }
}

Reduce阶段

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Pass-through reducer: emits the key once for every value in its group, so
 * the number of output records equals the number of map outputs it receives.
 */
public class RedTest extends Reducer<Data, NullWritable,Data,NullWritable> {
    @Override
    protected void reduce(Data key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Walk the group one value at a time and write the key for each one;
        // the values themselves carry no data.
        java.util.Iterator<NullWritable> cursor = values.iterator();
        while (cursor.hasNext()) {
            cursor.next();
            context.write(key, NullWritable.get());
        }
    }
}

Driver阶段

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.File;
import java.net.URI;

/**
 * Job driver: clears any previous output directory, configures the
 * mapper/reducer and the two cache files, then runs the job and exits the JVM
 * with the job's status.
 */
public class DriTest {
    /**
     * Removes the output directory if it exists, then launches the job.
     *
     * @param args unused
     */
    public static void main(String[] args) throws Exception {
        File previousOutput = new File("D:\\MP\\电信\\output");
        if (previousOutput.exists()) {
            delFile(previousOutput);
        }
        driver();
    }

    /**
     * Deletes {@code file}; when it is a directory, its contents are deleted
     * first, depth-first. The result of each delete call is not checked.
     *
     * @param file the file or directory to remove
     */
    public static void delFile(File file) {
        File[] children = file.listFiles();
        if (children != null) {
            for (File child : children) {
                delFile(child);
            }
        }
        file.delete();
    }

    /**
     * Configures and runs the job, then terminates the JVM with exit code 0 on
     * success or 1 on failure; this method therefore never returns normally.
     */
    public static void driver() throws Exception {
        Job job = Job.getInstance(new Configuration());

        // Classes that make up the job.
        job.setMapperClass(MapTest.class);
        job.setJarByClass(DriTest.class);
        job.setReducerClass(RedTest.class);

        // Map output and final output both use Data keys with no value.
        job.setMapOutputKeyClass(Data.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Data.class);
        job.setOutputValueClass(NullWritable.class);

        // Cache files: index 0 is the user list, index 1 is the province list.
        URI[] cacheFiles = {
                new URI("file:///D:/MP/电信/input/userPhone.txt"),
                new URI("file:///D:/MP/电信/input/location.txt")
        };
        job.setCacheFiles(cacheFiles);

        FileInputFormat.setInputPaths(job, "D:\\MP\\电信\\input\\data.txt");
        FileOutputFormat.setOutputPath(job, new Path("D:\\MP\\电信\\output"));

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值