/**数据
* userA,locationA,2018-01-01 08:00:00,60
userA,locationA,2018-01-01 09:00:00,60
userA,locationB,2018-01-01 10:00:00,60
userA,locationA,2018-01-01 11:00:00,60
*
*
最终结果
userA,locationA,2018-01-01 08:00:00,120
userA,locationB,2018-01-01 10:00:00,60 //注意时间依旧正序
userA,locationA,2018-01-01 11:00:00,60
*
* 基本思路,需要两个mr
* mr1 :
* ①按用户 ,位置,时间,正序排序,
* ②按用户 ,位置 分组
 * ③在reducer中的Iterable取第一条数据时间+分钟(换算成时间戳),与下一条时间(换算成时间戳)对比,
* 相同继续循环增加时间,不相同则输出之前的,取最新的一条数据作为新起点进行对比
* 分组后输出效果
* userA,locationA,2018-01-01 08:00:00,120
 userA,locationA,2018-01-01 11:00:00,60 //将同一分组的数据放到了一起,因此无法保证全局按时间正序
userA,locationB,2018-01-01 10:00:00,60
* mr2 :
* 根据上次输出的数据再跑一次mr,这次按时间正序排序即可(不用分组)
*
*/
代码如下
定义bean对象
package com.dxt.dingwei;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * MapReduce key bean for one location record: (user, location, start time,
 * duration in minutes). Natural order is ascending by user, then location,
 * then time — which is exactly the sort MR1 relies on before grouping.
 */
public class DingWeiBean implements WritableComparable<DingWeiBean> {

    private String user;
    private String location;
    private String time;   // formatted "yyyy-MM-dd HH:mm:ss" (parsed by the reducer)
    private int minuit;    // stay duration in minutes

    public DingWeiBean() {
        super();
    }

    /** Populates all fields from one split CSV record: user,location,time,minutes. */
    public void set(String[] split) {
        this.setUser(split[0]);
        this.setLocation(split[1]);
        this.setTime(split[2]);
        this.setMinuit(Integer.parseInt(split[3]));
    }

    /** Copies every field from another bean (used to snapshot the reduce key). */
    public void set(DingWeiBean dwb) {
        this.setUser(dwb.getUser());
        this.setLocation(dwb.getLocation());
        this.setTime(dwb.getTime());
        this.setMinuit(dwb.getMinuit());
    }

    public String getUser() {
        return user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    public String getLocation() {
        return location;
    }

    public void setLocation(String location) {
        this.location = location;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public int getMinuit() {
        return minuit;
    }

    public void setMinuit(int minuit) {
        this.minuit = minuit;
    }

    /**
     * Ascending comparison by user, then location, then time.
     * Results are normalized to -1 / 0 / 1, matching the original contract.
     *
     * @param o the bean to compare against
     * @return negative, zero or positive per {@link Comparable}
     */
    public int compareTo(DingWeiBean o) {
        // Compare the other bean against this one, falling through on ties;
        // the final sign flip turns that into ascending order for `this`.
        int c = o.user.compareTo(this.user);
        if (c == 0) {
            c = o.location.compareTo(this.location);
        }
        if (c == 0) {
            c = o.time.compareTo(this.time);
        }
        if (c == 0) {
            return 0;
        }
        return c > 0 ? -1 : 1;
    }

    /** Serializes all four fields, in declaration order. */
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.user);
        out.writeUTF(this.location);
        out.writeUTF(this.time);
        out.writeInt(this.minuit);
    }

    /** Deserializes in the same order as {@link #write(DataOutput)}. */
    public void readFields(DataInput in) throws IOException {
        this.user = in.readUTF();
        this.location = in.readUTF();
        this.time = in.readUTF();
        this.minuit = in.readInt();
    }

    /** CSV form used as the job's text output: user,location,time,minutes. */
    @Override
    public String toString() {
        return user + "," + location + "," + time + "," + minuit;
    }
}
定义分组条件
package com.dxt.dingwei;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* 分组操作,将user , location 相同的分到一个组中
*/
/**
 * Grouping comparator for MR1: beans with the same user AND location land in
 * the same reduce() call; time and minutes are deliberately ignored here so
 * that consecutive stays at one place can be merged by the reducer.
 */
public class DingWeiCompartor extends WritableComparator {

    public DingWeiCompartor() {
        // true => instantiate DingWeiBean instances for deserialized comparison
        super(DingWeiBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        DingWeiBean left = (DingWeiBean) a;
        DingWeiBean right = (DingWeiBean) b;
        // Only the zero / non-zero distinction matters for grouping; the sign
        // is normalized and inverted exactly as the original did.
        int c = left.getUser().compareTo(right.getUser());
        if (c == 0) {
            c = left.getLocation().compareTo(right.getLocation());
        }
        if (c == 0) {
            return 0;
        }
        return c > 0 ? -1 : 1;
    }
}
mapper
package com.dxt.dingwei;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Parses one CSV input line (user,location,time,minutes) into a DingWeiBean
 * key with a NullWritable value.
 *
 * Fixes over the original:
 * - removed the per-record debug System.out.println, which pollutes task logs
 *   and slows the map phase;
 * - malformed or blank lines (fewer than 4 fields) are skipped instead of
 *   killing the task with ArrayIndexOutOfBoundsException in dwb.set().
 */
public class DingWeiMapper extends Mapper<LongWritable, Text, DingWeiBean, NullWritable> {

    // Reused across map() calls to avoid one allocation per record (Hadoop idiom;
    // context.write serializes immediately, so reuse is safe).
    private final DingWeiBean dwb = new DingWeiBean();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] split = value.toString().split(",");
        if (split.length < 4) {
            return; // skip blank or malformed records
        }
        dwb.set(split);
        context.write(dwb, NullWritable.get());
    }
}
Reducer
package com.dxt.dingwei;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
/**
 * Merges consecutive stays. Within one (user, location) group — already
 * time-ascending thanks to the key sort — a record whose start time equals
 * the previous segment's start time plus its duration is folded into that
 * segment by summing the minutes; otherwise the finished segment is written
 * out and the current record starts a new one.
 *
 * Fixes over the original:
 * - a ParseException is no longer swallowed with printStackTrace(): the old
 *   code continued the loop with a stale `dwb`, silently corrupting merged
 *   totals. It now fails the task with a descriptive IOException.
 * - duration-to-millis conversion uses long arithmetic (60L * 1000) so a
 *   large minute count cannot overflow int before widening.
 */
public class DingweiReducer extends Reducer<DingWeiBean, NullWritable, DingWeiBean, NullWritable> {

    // The running "current segment" being merged; reused across groups.
    private final DingWeiBean dwb = new DingWeiBean();
    // SimpleDateFormat is not thread-safe, but each reduce task owns its own
    // Reducer instance, so an instance field is fine here.
    private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    @Override
    protected void reduce(DingWeiBean key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        boolean first = true;
        for (NullWritable ignored : values) {
            if (first) {
                // First record of the group seeds the current segment.
                dwb.set(key);
                first = false;
                continue;
            }
            long currStart;
            long segmentEnd;
            try {
                currStart = sdf.parse(key.getTime()).getTime();
                // Segment end = segment start + accumulated minutes, in millis.
                segmentEnd = sdf.parse(dwb.getTime()).getTime() + dwb.getMinuit() * 60L * 1000;
            } catch (ParseException e) {
                // Propagate instead of skipping: a skipped record would leave
                // the merge state inconsistent and produce wrong durations.
                throw new IOException("Unparseable time in record: " + key, e);
            }
            if (currStart == segmentEnd) {
                // Contiguous in time: extend the current segment.
                dwb.setMinuit(dwb.getMinuit() + key.getMinuit());
            } else {
                // Gap found: emit the finished segment, start a new one here.
                context.write(dwb, NullWritable.get());
                dwb.set(key);
            }
        }
        // Emit the final (possibly only) segment of the group.
        context.write(dwb, NullWritable.get());
    }
}
Driver
/**
 * Driver for MR1: sorts records by (user, location, time), groups them by
 * (user, location), and lets the reducer merge time-contiguous stays.
 */
public class DingWei {
    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        try {
            Job job = Job.getInstance(configuration);
            job.setJarByClass(DingWei.class);
            job.setMapperClass(DingWeiMapper.class);
            job.setReducerClass(DingweiReducer.class);
            // Map output types
            job.setMapOutputKeyClass(DingWeiBean.class);
            job.setMapOutputValueClass(NullWritable.class);
            // Final (reducer) output types.
            // BUG FIX: the output value class must be NullWritable — the reducer
            // emits (DingWeiBean, NullWritable); the original declared
            // DingWeiBean.class here, which mismatches the actual value type.
            job.setOutputKeyClass(DingWeiBean.class);
            job.setOutputValueClass(NullWritable.class);
            // Group same user + location into a single reduce() call.
            job.setGroupingComparatorClass(DingWeiCompartor.class);
            // Input/output paths (hard-coded for this local demo run).
            FileInputFormat.setInputPaths(job, new Path("E:\\input"));
            FileOutputFormat.setOutputPath(job, new Path("E:\\outDabwei"));
            boolean b = job.waitForCompletion(true);
            System.exit(b ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
mr1执行完成后输出数据如下:按(用户,位置)分组输出,时间不是全局正序,因此还不符合最终要求
userA,locationA,2018-01-01 08:00:00,120
userA,locationA,2018-01-01 11:00:00,60
userA,locationB,2018-01-01 10:00:00,60
mr2的主要流程是在mr1的基础上对时间排序,mr1的输出作为mr2的输入
//bean对象自定义时间排序
/**
* 比较,只需要正序比较时间就行
* @param o
* @return
*/
/**
 * Ascending order by time only (the global re-sort performed by MR2).
 *
 * BUG FIX: the original returned 1 even when the two times were equal
 * (comp_time > 0 ? -1 : 1 maps 0 to 1), violating the compareTo contract
 * (x.compareTo(x) must be 0) and making equal keys compare as distinct.
 * Delegating to String.compareTo yields a correct, sign-consistent result.
 *
 * @param o the bean to compare against
 * @return negative, zero or positive per {@link Comparable}
 */
public int compareTo(DingweiBean2 o) {
    return this.time.compareTo(o.time);
}
//map,reducer直接write
最终数据
userA,locationA,2018-01-01 08:00:00,120
userA,locationB,2018-01-01 10:00:00,60
userA,locationA,2018-01-01 11:00:00,60