hadoop分组只能连续的数据分组问题分析
一,问题发现
需求:假设一组数据 有ip和时间 现在需要根据时间排序根据ip分组
代码实现如下
数据 后面多余字段省略
true 58.215.204.118 - 2018-11-01 06:52:27
true 58.215.204.118 - 2018-11-01 06:51:36
true 58.215.204.118 - 2018-11-01 06:51:37
true 58.248.178.212 - 2018-11-01 06:51:37
true 58.248.178.212 - 2018-11-01 06:51:40
true 58.215.204.118 - 2018-11-01 06:52:26
true 58.215.204.118 - 2018-11-01 06:52:27
true 58.215.204.118 - 2018-11-01 06:52:27
true 58.215.204.118 - 2018-11-01 06:51:36
bean对象
/**
* 对接外部数据的层,表结构定义最好跟外部数据源保持一致
* 术语: 贴源表
* @author
*
*/
public class WebLogBean implements WritableComparable<WebLogBean> {
public static SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US);
private boolean valid = true;// 判断数据是否合法
private String remote_addr;// 记录客户端的ip地址
private String remote_user;// 记录客户端用户名称,忽略属性"-"
private String time_local;// 记录访问时间与时区
private String request;// 记录请求的url与http协议
private String status;// 记录请求状态;成功是200
private String body_bytes_sent;// 记录发送给客户端文件主体内容大小
private String http_referer;// 用来记录从那个页面链接访问过来的
private String http_user_agent;// 记录客户浏览器的相关信息
public void set(boolean valid,String remote_addr, String remote_user, String time_local, String request, String status, String body_bytes_sent, String http_referer, String http_user_agent) {
this.valid = valid;
this.remote_addr = remote_addr;
this.remote_user = remote_user;
this.time_local = time_local;
this.request = request;
this.status = status;
this.body_bytes_sent = body_bytes_sent;
this.http_referer = http_referer;
this.http_user_agent = http_user_agent;
}
public String getRemote_addr() {
return remote_addr;
}
public void setRemote_addr(String remote_addr) {
this.remote_addr = remote_addr;
}
public String getRemote_user() {
return remote_user;
}
public void setRemote_user(String remote_user) {
this.remote_user = remote_user;
}
public String getTime_local() {
return this.time_local;
}
public void setTime_local(String time_local) {
this.time_local = time_local;
}
public String getRequest() {
return request;
}
public void setRequest(String request) {
this.request = request;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
public String getBody_bytes_sent() {
return body_bytes_sent;
}
public void setBody_bytes_sent(String body_bytes_sent) {
this.body_bytes_sent = body_bytes_sent;
}
public String getHttp_referer() {
return http_referer;
}
public void setHttp_referer(String http_referer) {
this.http_referer = http_referer;
}
public String getHttp_user_agent() {
return http_user_agent;
}
public void setHttp_user_agent(String http_user_agent) {
this.http_user_agent = http_user_agent;
}
public boolean isValid() {
return valid;
}
public void setValid(boolean valid) {
this.valid = valid;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(this.valid);
sb.append("\001").append(this.getRemote_addr());
sb.append("\001").append(this.getRemote_user());
sb.append("\001").append(this.getTime_local());
sb.append("\001").append(this.getRequest());
sb.append("\001").append(this.getStatus());
sb.append("\001").append(this.getBody_bytes_sent());
sb.append("\001").append(this.getHttp_referer());
sb.append("\001").append(this.getHttp_user_agent());
return sb.toString();
}
@Override
public void readFields(DataInput in) throws IOException {
this.valid = in.readBoolean();
this.remote_addr = in.readUTF();
this.remote_user = in.readUTF();
this.time_local = in.readUTF();
this.request = in.readUTF();
this.status = in.readUTF();
this.body_bytes_sent = in.readUTF();
this.http_referer = in.readUTF();
this.http_user_agent = in.readUTF();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeBoolean(this.valid);
out.writeUTF(null==remote_addr?"":remote_addr);
out.writeUTF(null==remote_user?"":remote_user);
out.writeUTF(null==time_local?"":time_local);
out.writeUTF(null==request?"":request);
out.writeUTF(null==status?"":status);
out.writeUTF(null==body_bytes_sent?"":body_bytes_sent);
out.writeUTF(null==http_referer?"":http_referer);
out.writeUTF(null==http_user_agent?"":http_user_agent);
}
/**
* @Description: 实现根据日期排序排序
* @Param null
* @return:
* @Author: liangfeng
* @Date: 2019-07-12 10:11
*/
@Override
public int compareTo(WebLogBean o) {
if(remote_addr.equals(o.getRemote_addr())){
try {
Date data1 = simpleDateFormat.parse(time_local);
Date date2 = simpleDateFormat.parse(o.getTime_local());
return data1.compareTo(date2);
} catch (ParseException e) {
e.printStackTrace();
}
}
return 0;
//return remote_addr.compareTo(o.getRemote_addr());
}
}
mapper
**
* @ClassName 读取数据写入webLogBean中
* @Description TODO
* @Author liangfeng
* @Date 2019-07-09 23:08
* @Version 1.0
**/
public class Job16Mapper extends Mapper<LongWritable, Text, WebLogBean, WebLogBean> {
private WebLogBean webLogBean;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
webLogBean = new WebLogBean();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] fields = s.split("\001");
if("true".equals(fields[0])){
webLogBean.set(true, fields[1], fields[2], fields[3], fields[4], fields[5], fields[6], fields[7], fields[8]);
context.write(webLogBean,webLogBean);
}else{
System.out.println("无效数据");
}
}
}
group分组
/**
* @ClassName Job16Group
* @Description 根据ip分组
* @Author liangfeng
* @Date 2019-07-10 00:22
* @Version 1.0
**/
public class Job16Group extends WritableComparator{
public Job16Group(){
super(WebLogBean.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
WebLogBean webLogBean1 = (WebLogBean) a;
WebLogBean webLogBean2 = (WebLogBean) b;
System.out.println("addr:"+webLogBean1.getRemote_addr()+":"+webLogBean2.getRemote_addr()+":"+webLogBean1.getRemote_addr().compareTo(webLogBean2.getRemote_addr()));
return webLogBean1.getRemote_addr().compareTo(webLogBean2.getRemote_addr());
}
}
reduce代码
/**
* @ClassName Job16Reduce
* @Description 输出排序和分组后每组的数据
* @Author liangfeng
* @Date 2019-07-10 00:24
* @Version 1.0
**/
public class Job16Reduce extends Reducer<WebLogBean, WebLogBean, PageViewsBean,NullWritable> {
private PageViewsBean pageViewsBean;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
pageViewsBean = new PageViewsBean();
}
@Override
protected void reduce(WebLogBean key, Iterable<WebLogBean> values, Context context) throws IOException, InterruptedException {
//遍历添加数据到集合中
List<WebLogBean> webLogBeans = new ArrayList<WebLogBean>();
for (WebLogBean value : values) {
WebLogBean webLogBean = new WebLogBean();
System.out.println(key.getRemote_addr()+":"+key.getTime_local());
try {
BeanUtils.copyProperties(webLogBean,value);
} catch (Exception e) {
e.printStackTrace();
}
webLogBeans.add(webLogBean);
}
System.out.println("一组------------------");
}
}
输出结果
一组
//addr:58.215.204.118:58.215.204.118:0
58.215.204.118:2018-11-01 06:51:36
//addr:58.215.204.118:58.215.204.118:0
58.215.204.118:2018-11-01 06:51:37
//addr:58.215.204.118:58.248.178.212:-3
58.215.204.118:2018-11-01 06:52:27
二组------------------
//addr:58.248.178.212:58.248.178.212:0
58.248.178.212:2018-11-01 06:51:37
//addr:58.248.178.212:58.215.204.118:3
58.248.178.212:2018-11-01 06:51:40
三组------------------
//addr:58.215.204.118:58.215.204.118:0
58.215.204.118:2018-11-01 06:51:36
//addr:58.215.204.118:58.215.204.118:0
58.215.204.118:2018-11-01 06:52:26
//addr:58.215.204.118:58.215.204.118:0
58.215.204.118:2018-11-01 06:52:27
58.215.204.118:2018-11-01 06:52:27
组------------------
结果分析
addr是分组比较的日志
观察组内时间排序都没有问题
但是观察第一组和第三组发现ip相同但是不在同一组
再和原始数据进行对比你会发现 分组只是把连续ip相同的数据划分为一组中间如果有其他ip的话一组就结束
观察addr的比较日志可以猜测分组是连续两条数据对比来分组的如果上一条和下一条数据不相同则分组结束,尽管后面还有相同数据也不把他们分为一组。
二,源码分析
我们带着猜想进行源码分析
主要是查看源码中如何调用分组的比较器实现分组的
job中设置啦GroupingComparatorClass
job.setGroupingComparatorClass(Job16Group.class);
那我们就全局搜下在哪getGroupingComparatorClass并使用的
ReduceTask中发现如下代码
//1,获取自定义分组比较器
RawComparator comparator = job.getOutputValueGroupingComparator();
//2,执行reduce
if (useNewApi) {
runNewReducer(job, umbilical, reporter, rIter, comparator,
keyClass, valueClass);
} else {
runOldReducer(job, umbilical, reporter, rIter, comparator,
keyClass, valueClass);
}
....
//去runNewReducer看看
//创建reducerContext 传入comparator
reducerContext = createReduceContext(reducer, job, getTaskID(),
rIter, reduceInputKeyCounter,
reduceInputValueCounter,
trackedRW,
committer,
reporter, comparator, keyClass,
valueClass);
try {
//执行reduce 调用reducerContext的getvalues方法传入到reduce中
reducer.run(reducerContext);
} finally {
trackedRW.close(reducerContext);
}
//去看看createReduceContext方法 创建啦 ReduceContextImpl对象传入比较器
createReduceContext(org.apache.hadoop.mapreduce.Reducer
<INKEY,INVALUE,OUTKEY,OUTVALUE> reducer,
Configuration job,
org.apache.hadoop.mapreduce.TaskAttemptID taskId,
RawKeyValueIterator rIter,
org.apache.hadoop.mapreduce.Counter inputKeyCounter,
org.apache.hadoop.mapreduce.Counter inputValueCounter,
org.apache.hadoop.mapreduce.RecordWriter<OUTKEY,OUTVALUE> output,
org.apache.hadoop.mapreduce.OutputCommitter committer,
org.apache.hadoop.mapreduce.StatusReporter reporter,
RawComparator<INKEY> comparator,
Class<INKEY> keyClass, Class<INVALUE> valueClass
) throws IOException, InterruptedException {
org.apache.hadoop.mapreduce.ReduceContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
reduceContext =
new ReduceContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, taskId,
rIter,
inputKeyCounter,
inputValueCounter,
output,
committer,
reporter,
comparator,
keyClass,
valueClass);
org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
reducerContext =
new WrappedReducer<INKEY, INVALUE, OUTKEY, OUTVALUE>().getReducerContext(
reduceContext);
return reducerContext;
}
上面代码看到comparator 被传入到啦ReduceContextImpl中我们去ReduceContextImpl中查看代码
public class ReduceContextImpl<KEYIN,VALUEIN,KEYOUT,VALUEOUT>
extends TaskInputOutputContextImpl<KEYIN,VALUEIN,KEYOUT,VALUEOUT>
implements ReduceContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
private RawKeyValueIterator input;
private Counter inputValueCounter;
private Counter inputKeyCounter;
private RawComparator<KEYIN> comparator;
private KEYIN key; // current key
private VALUEIN value; // current value
private boolean firstValue = false; // first value in key
private boolean nextKeyIsSame = false; // more w/ this key
private boolean hasMore; // more in file
protected Progressable reporter;
private Deserializer<KEYIN> keyDeserializer;
private Deserializer<VALUEIN> valueDeserializer;
private DataInputBuffer buffer = new DataInputBuffer();
private BytesWritable currentRawKey = new BytesWritable();
private ValueIterable iterable = new ValueIterable();
private boolean isMarked = false;
private BackupStore<KEYIN,VALUEIN> backupStore;
private final SerializationFactory serializationFactory;
private final Class<KEYIN> keyClass;
private final Class<VALUEIN> valueClass;
private final Configuration conf;
private final TaskAttemptID taskid;
private int currentKeyLength = -1;
private int currentValueLength = -1;
public ReduceContextImpl(Configuration conf, TaskAttemptID taskid,
RawKeyValueIterator input,
Counter inputKeyCounter,
Counter inputValueCounter,
RecordWriter<KEYOUT,VALUEOUT> output,
OutputCommitter committer,
StatusReporter reporter,
RawComparator<KEYIN> comparator,
Class<KEYIN> keyClass,
Class<VALUEIN> valueClass
) throws InterruptedException, IOException{
super(conf, taskid, output, committer, reporter);
this.input = input;
this.inputKeyCounter = inputKeyCounter;
this.inputValueCounter = inputValueCounter;
this.comparator = comparator;
this.serializationFactory = new SerializationFactory(conf);
this.keyDeserializer = serializationFactory.getDeserializer(keyClass);
this.keyDeserializer.open(buffer);
this.valueDeserializer = serializationFactory.getDeserializer(valueClass);
this.valueDeserializer.open(buffer);
hasMore = input.next();
this.keyClass = keyClass;
this.valueClass = valueClass;
this.conf = conf;
this.taskid = taskid;
}
/** Start processing next unique key. */
public boolean nextKey() throws IOException,InterruptedException {
while (hasMore && nextKeyIsSame) {
nextKeyValue();
}
if (hasMore) {
if (inputKeyCounter != null) {
inputKeyCounter.increment(1);
}
return nextKeyValue();
} else {
return false;
}
}
/**
* Advance to the next key/value pair.
*/
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (!hasMore) {
key = null;
value = null;
return false;
}
firstValue = !nextKeyIsSame;
DataInputBuffer nextKey = input.getKey();
currentRawKey.set(nextKey.getData(), nextKey.getPosition(),
nextKey.getLength() - nextKey.getPosition());
buffer.reset(currentRawKey.getBytes(), 0, currentRawKey.getLength());
key = keyDeserializer.deserialize(key);
DataInputBuffer nextVal = input.getValue();
buffer.reset(nextVal.getData(), nextVal.getPosition(), nextVal.getLength()
- nextVal.getPosition());
value = valueDeserializer.deserialize(value);
currentKeyLength = nextKey.getLength() - nextKey.getPosition();
currentValueLength = nextVal.getLength() - nextVal.getPosition();
if (isMarked) {
backupStore.write(nextKey, nextVal);
}
hasMore = input.next();
if (hasMore) {
nextKey = input.getKey();
nextKeyIsSame = comparator.compare(currentRawKey.getBytes(), 0,
currentRawKey.getLength(),
nextKey.getData(),
nextKey.getPosition(),
nextKey.getLength() - nextKey.getPosition()
) == 0;
} else {
nextKeyIsSame = false;
}
inputValueCounter.increment(1);
return true;
}
public KEYIN getCurrentKey() {
return key;
}
@Override
public VALUEIN getCurrentValue() {
return value;
}
BackupStore<KEYIN,VALUEIN> getBackupStore() {
return backupStore;
}
protected class ValueIterator implements ReduceContext.ValueIterator<VALUEIN> {
private boolean inReset = false;
private boolean clearMarkFlag = false;
@Override
public boolean hasNext() {
try {
if (inReset && backupStore.hasNext()) {
return true;
}
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("hasNext failed", e);
}
return firstValue || nextKeyIsSame;
}
@Override
public VALUEIN next() {
if (inReset) {
try {
if (backupStore.hasNext()) {
backupStore.next();
DataInputBuffer next = backupStore.nextValue();
buffer.reset(next.getData(), next.getPosition(), next.getLength()
- next.getPosition());
value = valueDeserializer.deserialize(value);
return value;
} else {
inReset = false;
backupStore.exitResetMode();
if (clearMarkFlag) {
clearMarkFlag = false;
isMarked = false;
}
}
} catch (IOException e) {
e.printStackTrace();
throw new RuntimeException("next value iterator failed", e);
}
}
// if this is the first record, we don't need to advance
if (firstValue) {
firstValue = false;
return value;
}
// if this isn't the first record and the next key is different, they
// can't advance it here.
if (!nextKeyIsSame) {
throw new NoSuchElementException("iterate past last value");
}
// otherwise, go to the next key/value pair
try {
nextKeyValue();
return value;
} catch (IOException ie) {
throw new RuntimeException("next value iterator failed", ie);
} catch (InterruptedException ie) {
// this is bad, but we can't modify the exception list of java.util
throw new RuntimeException("next value iterator interrupted", ie);
}
}
@Override
public void remove() {
throw new UnsupportedOperationException("remove not implemented");
}
@Override
public void mark() throws IOException {
if (getBackupStore() == null) {
backupStore = new BackupStore<KEYIN,VALUEIN>(conf, taskid);
}
isMarked = true;
if (!inReset) {
backupStore.reinitialize();
if (currentKeyLength == -1) {
// The user has not called next() for this iterator yet, so
// there is no current record to mark and copy to backup store.
return;
}
assert (currentValueLength != -1);
int requestedSize = currentKeyLength + currentValueLength +
WritableUtils.getVIntSize(currentKeyLength) +
WritableUtils.getVIntSize(currentValueLength);
DataOutputStream out = backupStore.getOutputStream(requestedSize);
writeFirstKeyValueBytes(out);
backupStore.updateCounters(requestedSize);
} else {
backupStore.mark();
}
}
@Override
public void reset() throws IOException {
// We reached the end of an iteration and user calls a
// reset, but a clearMark was called before, just throw
// an exception
if (clearMarkFlag) {
clearMarkFlag = false;
backupStore.clearMark();
throw new IOException("Reset called without a previous mark");
}
if (!isMarked) {
throw new IOException("Reset called without a previous mark");
}
inReset = true;
backupStore.reset();
}
@Override
public void clearMark() throws IOException {
if (getBackupStore() == null) {
return;
}
if (inReset) {
clearMarkFlag = true;
backupStore.clearMark();
} else {
inReset = isMarked = false;
backupStore.reinitialize();
}
}
/**
* This method is called when the reducer moves from one key to
* another.
* @throws IOException
*/
public void resetBackupStore() throws IOException {
if (getBackupStore() == null) {
return;
}
inReset = isMarked = false;
backupStore.reinitialize();
currentKeyLength = -1;
}
/**
* This method is called to write the record that was most recently
* served (before a call to the mark). Since the framework reads one
* record in advance, to get this record, we serialize the current key
* and value
* @param out
* @throws IOException
*/
private void writeFirstKeyValueBytes(DataOutputStream out)
throws IOException {
assert (getCurrentKey() != null && getCurrentValue() != null);
WritableUtils.writeVInt(out, currentKeyLength);
WritableUtils.writeVInt(out, currentValueLength);
Serializer<KEYIN> keySerializer =
serializationFactory.getSerializer(keyClass);
keySerializer.open(out);
keySerializer.serialize(getCurrentKey());
Serializer<VALUEIN> valueSerializer =
serializationFactory.getSerializer(valueClass);
valueSerializer.open(out);
valueSerializer.serialize(getCurrentValue());
}
}
protected class ValueIterable implements Iterable<VALUEIN> {
private ValueIterator iterator = new ValueIterator();
@Override
public Iterator<VALUEIN> iterator() {
return iterator;
}
}
/**
* Iterate through the values for the current key, reusing the same value
* object, which is stored in the context.
* @return the series of values associated with the current key. All of the
* objects returned directly and indirectly from this method are reused.
*/
public
Iterable<VALUEIN> getValues() throws IOException, InterruptedException {
return iterable;
}
}
其中主要看如下两个方法
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (!hasMore) {
key = null;
value = null;
return false;
}
firstValue = !nextKeyIsSame;
DataInputBuffer nextKey = input.getKey();
currentRawKey.set(nextKey.getData(), nextKey.getPosition(),
nextKey.getLength() - nextKey.getPosition());
buffer.reset(currentRawKey.getBytes(), 0, currentRawKey.getLength());
key = keyDeserializer.deserialize(key);
DataInputBuffer nextVal = input.getValue();
buffer.reset(nextVal.getData(), nextVal.getPosition(), nextVal.getLength()
- nextVal.getPosition());
value = valueDeserializer.deserialize(value);
currentKeyLength = nextKey.getLength() - nextKey.getPosition();
currentValueLength = nextVal.getLength() - nextVal.getPosition();
if (isMarked) {
backupStore.write(nextKey, nextVal);
}
hasMore = input.next();
if (hasMore) {
nextKey = input.getKey();
//此处调用我们的比较器进行key值比较 比较的只是下一行数据的key值结果赋值给nextKeyIsSame
nextKeyIsSame = comparator.compare(currentRawKey.getBytes(), 0,
currentRawKey.getLength(),
nextKey.getData(),
nextKey.getPosition(),
nextKey.getLength() - nextKey.getPosition()
) == 0;
} else {
nextKeyIsSame = false;
}
inputValueCounter.increment(1);
return true;
}
迭代器中方法
@Override
public boolean hasNext() {
try {
if (inReset && backupStore.hasNext()) {
return true;
}
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("hasNext failed", e);
}
return firstValue ||nextKeyIsSame;
//firstValue 表示是否是第一个值 判断是否有值会默认先去一行数据。
//nextKeyIsSame 和下一个key值是否相同 相同则可以继续遍历
//也就是说按照行顺序比较相同的为一组
}
三,总结
如上源码分析可以看出:
group分组其实就是把连续相同的key放到一组中不连续但是key相同的数据还是不在一组中
所有我们进行不同字段排序分组的时候需要注意
上面案例如果想要实现需求必须先把相同的ip连续存放才不会出现问题