通过Mycat实现数据汇聚和排序,不仅可以减少各分片和客户端之间的数据传输IO,同时也可以把开发者从复杂的数据处理中解放出来,专注于开发业务代码。数据汇聚和排序在Query语句中使用也比较频繁;由于汇聚和排序都涉及数据排序操作,所以把它们放在同一小节中。
数据排序
MySQL中有两种排序方式:一种利用有序索引直接获取有序数据,另一种通过相应排序算法将获取到的数据在内存中排序。Mycat中的数据排序采用堆排序法,对多个分片返回的有序数据进行合并排序后再返回客户端,执行流程如下图:
相关类说明
RowDataSorter: 数据排序
RowDataCmp:字段对比
HeapItf:堆排序接口
MaxHeap:最大堆排序
代码解析
以下是排序算法调用入口以及数据返回
public class DataMergeService implements Runnable {

    /**
     * Wires up the row sorter once column metadata is known. Every ORDER BY
     * column is resolved to its position in the select list; a column that is
     * not part of the select list is rejected.
     */
    public void onRowMetaData(Map<String, ColMeta> columToIndx, int fieldCount) {
        // No ORDER BY clause -> nothing to prepare.
        if (rrs.getOrderByCols() == null) {
            return;
        }
        LinkedHashMap<String, Integer> orders = rrs.getOrderByCols();
        OrderCol[] orderCols = new OrderCol[orders.size()];
        int pos = 0;
        for (Map.Entry<String, Integer> entry : orders.entrySet()) {
            String colName = StringUtil.removeBackquote(entry.getKey().toUpperCase());
            ColMeta meta = columToIndx.get(colName);
            if (meta == null) {
                throw new IllegalArgumentException(
                        "all columns in order by clause should be in the selected column list!"
                                + entry.getKey());
            }
            orderCols[pos++] = new OrderCol(meta, entry.getValue());
        }
        // Hand the resolved columns plus the LIMIT window to the sorter.
        RowDataSorter rowSorter = new RowDataSorter(orderCols);
        rowSorter.setLimit(rrs.getLimitStart(), rrs.getLimitSize());
        sorter = rowSorter;
    }

    /**
     * Collects the final result rows: grouped rows (when a grouper is active)
     * are drained into the sorter (when one is active), and the sorted window
     * is returned.
     */
    private List<RowDataPacket> getResults(byte[] eof) {
        List<RowDataPacket> rows = result;
        if (this.grouper != null) {
            rows = grouper.getResult();
            grouper = null;
        }
        if (sorter != null) {
            // Move every intermediate row into the sorter, releasing it from
            // the source list as we go.
            if (rows != null) {
                for (Iterator<RowDataPacket> it = rows.iterator(); it.hasNext(); ) {
                    sorter.addRow(it.next());
                    it.remove();
                }
            }
            // Fetch the ORDER BY / LIMIT processed rows.
            rows = sorter.getSortedResult();
            sorter = null;
        }
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("prepare mpp merge result for " + rrs.getStatement());
        }
        return rows;
    }
}
堆排序MaxHeap代码实现
/**
 * Bounded max-heap used for top-N ORDER BY processing: the heap retains the
 * N smallest rows seen so far (its root is the largest of them), so a new
 * row only enters the heap when it compares below the current root.
 */
public class MaxHeap implements HeapItf {

    private final RowDataCmp cmp;           // comparator over the ORDER BY columns
    private final List<RowDataPacket> data; // array-backed binary heap

    /**
     * @param cmp  row comparator driving the heap order
     * @param size expected number of rows (LIMIT window); used as the
     *             initial capacity of the backing list
     */
    public MaxHeap(RowDataCmp cmp, int size) {
        this.cmp = cmp;
        // Fix: the size hint was previously ignored (new ArrayList<>()),
        // causing avoidable re-allocations while collecting the top-N rows;
        // presize, guarding against non-positive hints.
        this.data = new ArrayList<>(Math.max(size, 10));
    }

    /** Establishes the max-heap property over the whole backing list. */
    @Override
    public void buildHeap() {
        int len = data.size();
        // Heapify every internal node bottom-up; leaves need no work.
        for (int i = len / 2 - 1; i >= 0; i--) {
            heapifyRecursive(i, len);
        }
    }

    /**
     * Sifts the element at index i down until the subtree rooted at i is a
     * max-heap again. Only the first {@code size} elements participate.
     */
    protected void heapifyRecursive(int i, int size) {
        int l = left(i);
        int r = right(i);
        int max = i;
        if (l < size && cmp.compare(data.get(l), data.get(i)) > 0)
            max = l;
        if (r < size && cmp.compare(data.get(r), data.get(max)) > 0)
            max = r;
        if (i == max)
            return;
        swap(i, max);
        heapifyRecursive(max, size);
    }

    // Index of the right child of node i (0-based heap layout).
    private int right(int i) {
        return (i + 1) << 1;
    }

    // Index of the left child of node i (0-based heap layout).
    private int left(int i) {
        return ((i + 1) << 1) - 1;
    }

    // Swap the elements at positions i and j.
    private void swap(int i, int j) {
        RowDataPacket tmp = data.get(i);
        RowDataPacket elementAt = data.get(j);
        data.set(i, elementAt);
        data.set(j, tmp);
    }

    /** @return the current root (largest retained row) */
    @Override
    public RowDataPacket getRoot() {
        return data.get(0);
    }

    /** Replaces the root and restores the heap property. */
    @Override
    public void setRoot(RowDataPacket root) {
        data.set(0, root);
        heapifyRecursive(0, data.size());
    }

    /** @return the raw backing list (heap order, not sorted) */
    @Override
    public List<RowDataPacket> getData() {
        return data;
    }

    /** Appends a row without restoring the heap property (see buildHeap). */
    @Override
    public void add(RowDataPacket row) {
        data.add(row);
    }

    /**
     * Offers a row to the full heap: it is admitted only when it is smaller
     * than the current root, which it then replaces (evicting the largest
     * retained row so the heap keeps the N smallest).
     *
     * @return true when the row replaced the root
     */
    @Override
    public boolean addIfRequired(RowDataPacket row) {
        RowDataPacket root = getRoot();
        if (cmp.compare(row, root) < 0) {
            setRoot(row);
            return true;
        }
        return false;
    }

    /**
     * In-place heapsort: repeatedly swaps the root with the current tail and
     * shrinks the heap, leaving the last {@code size} positions of the list
     * sorted.
     *
     * @param size number of rows actually needed; values outside (0, total]
     *             sort the whole list
     */
    @Override
    public void heapSort(int size) {
        final int total = data.size();
        // Defensive clamp for nonsensical sizes.
        if (size <= 0 || size > total) {
            size = total;
        }
        // Stop early once the requested number of tail positions is sorted.
        final int min = size == total ? 0 : (total - size - 1);
        for (int i = total - 1; i > min; i--) {
            swap(0, i);
            heapifyRecursive(0, i);
        }
    }
}
字段排序代码实现
public class RowDataCmp implements Comparator<RowDataPacket> {

    private OrderCol[] orderCols;

    public RowDataCmp(OrderCol[] orderCols) {
        this.orderCols = orderCols;
    }

    /**
     * Compares two rows column by column in ORDER BY declaration order; the
     * first non-equal column decides. A DESC column simply compares the two
     * operands in swapped order.
     */
    @Override
    public int compare(RowDataPacket o1, RowDataPacket o2) {
        for (OrderCol col : this.orderCols) {
            int idx = col.colMeta.colIndex;
            byte[] v1 = o1.fieldValues.get(idx);
            byte[] v2 = o2.fieldValues.get(idx);
            int rc = (col.orderType == OrderCol.COL_ORDER_TYPE_ASC)
                    ? RowDataPacketSorter.compareObject(v1, v2, col)
                    : RowDataPacketSorter.compareObject(v2, v1, col);
            if (rc != 0) {
                return rc;
            }
        }
        // All ORDER BY columns equal.
        return 0;
    }
}
数据汇聚
group by实际上也需要对数据进行排序,与order by相比,group by多了排序后的数据分组,在分组的同时也可以使用以下聚合函数:count、sum、max、min、avg。在MySQL中group by有三种实现方式:第一种利用松散(Loose)索引扫描实现,第二种使用紧凑(Tight)索引扫描实现,第三种使用临时表实现。当所有分片返回已经汇聚好的数据时,Mycat对返回的所有数据进行合并汇聚后再返回客户端,执行流程如下图:
相关类说明
MergeCol: 聚合方法
ColMeta:聚合方法以及聚合类型
RowDataPacketGrouper:数据汇聚类
代码解析
以下是数据汇聚调用入口以及数据返回
public class DataMergeService implements Runnable {
/**
 * Builds the row grouper once column metadata is available. Every aggregate
 * column of the statement is registered as a MergeCol; AVG is decomposed
 * into its helper SUM and COUNT columns (named <col>SUM / <col>COUNT in the
 * select list) so it can be recomputed after the per-shard rows are merged.
 */
public void onRowMetaData(Map<String, ColMeta> columToIndx, int fieldCount) {
if (rrs.isHasAggrColumn()) {
List<MergeCol> mergCols = new LinkedList<MergeCol>();
Map<String, Integer> mergeColsMap = rrs.getMergeCols();
if (mergeColsMap != null) {
for (Map.Entry<String, Integer> mergEntry : mergeColsMap
.entrySet()) {
String colName = mergEntry.getKey().toUpperCase();
int type = mergEntry.getValue();
if (MergeCol.MERGE_AVG == type) {
// AVG is rewritten as SUM/COUNT; both helper columns must exist.
ColMeta sumColMeta = columToIndx.get(colName + "SUM");
ColMeta countColMeta = columToIndx.get(colName
+ "COUNT");
if (sumColMeta != null && countColMeta != null) {
ColMeta colMeta = new ColMeta(sumColMeta.colIndex,
countColMeta.colIndex,
sumColMeta.getColType());
mergCols.add(new MergeCol(colMeta, mergEntry
.getValue()));
}
} else {
ColMeta colMeta = columToIndx.get(colName);
mergCols.add(new MergeCol(colMeta, mergEntry.getValue()));
}
}
}
// Also register aggregate columns that carry no alias, e.g. COUNT(*).
for (Map.Entry<String, ColMeta> fieldEntry : columToIndx.entrySet()) {
String colName = fieldEntry.getKey();
int result = MergeCol.tryParseAggCol(colName);
if (result != MergeCol.MERGE_UNSUPPORT
&& result != MergeCol.MERGE_NOMERGE) {
mergCols.add(new MergeCol(fieldEntry.getValue(), result));
}
}
grouper = new RowDataPacketGrouper(groupColumnIndexs,
mergCols.toArray(new MergeCol[mergCols.size()]),
rrs.getHavingCols());
}
}
/**
 * Returns the merged result rows (at most i*(offset+size) rows).
 *
 * @param eof trailing EOF packet bytes (not used in this method)
 * @return grouped and sorted rows ready to be written to the client
 */
private List<RowDataPacket> getResults(byte[] eof) {
List<RowDataPacket> tmpResult = result;
if (this.grouper != null) {
tmpResult = grouper.getResult();
grouper = null;
}
if (sorter != null) {
// Feed the rows produced by the grouper into the sorter, releasing
// each row from the intermediate list as we go.
if (tmpResult != null) {
Iterator<RowDataPacket> itor = tmpResult.iterator();
while (itor.hasNext()) {
sorter.addRow(itor.next());
itor.remove();
}
}
// Fetch the ORDER BY / LIMIT processed rows.
tmpResult = sorter.getSortedResult();
sorter = null;
}
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("prepare mpp merge result for " + rrs.getStatement());
}
return tmpResult;
}
}
数据汇聚代码实现
/**
 * Merges rows returned by the shards according to the GROUP BY columns,
 * applying the registered aggregate functions (SUM/COUNT/MAX/MIN/AVG) and
 * an optional HAVING filter.
 */
public class RowDataPacketGrouper {
// Aggregated result rows; wrapped in a synchronized list — presumably rows
// can arrive from multiple I/O threads. NOTE(review): iteration in addRow()
// is not guarded by the list's lock; confirm callers serialize access.
private List<RowDataPacket> result = Collections.synchronizedList(new ArrayList<RowDataPacket>());//aggregated result rows
private final MergeCol[] mergCols;// aggregate columns and their merge types
private final int[] groupColumnIndexs;// select-list indexes of GROUP BY columns
private boolean isMergAvg=false;// whether AVG columns were already finalized
private HavingCols havingCols;// optional HAVING condition
public RowDataPacketGrouper(int[] groupColumnIndexs, MergeCol[] mergCols,HavingCols havingCols) {
super();
this.groupColumnIndexs = groupColumnIndexs;
this.mergCols = mergCols;
this.havingCols = havingCols;
}
/**
 * Finalizes and returns the grouped rows: AVG columns are computed from
 * their SUM/COUNT helpers exactly once, then the HAVING filter (if any)
 * is applied.
 */
public List<RowDataPacket> getResult() {
// Finalize AVG columns only once.
if(!isMergAvg)
{
for (RowDataPacket row : result)
{
// Replace each AVG column's SUM value with SUM/COUNT.
mergAvg(row);
}
isMergAvg=true;
}
// Drop rows that fail the HAVING condition.
if(havingCols != null){
filterHaving();
}
return result;
}
// Merge a new row into its matching group, or start a new group.
public void addRow(RowDataPacket rowDataPkg) {
for (RowDataPacket row : result) {
if (sameGropuColums(rowDataPkg, row)) {
aggregateRow(row, rowDataPkg);
return;
}
}
// No matching group yet — insert the row as a new group.
result.add(rowDataPkg);
}
// Fold newRow's aggregate columns into toRow. AVG is skipped here; it is
// finalized later in mergAvg() from the SUM/COUNT helper columns.
private void aggregateRow(RowDataPacket toRow, RowDataPacket newRow) {
if (mergCols == null) {
return;
}
for (MergeCol merg : mergCols) {
if(merg.mergeType!=MergeCol.MERGE_AVG)
{
byte[] result = mertFields(
toRow.fieldValues.get(merg.colMeta.colIndex),
newRow.fieldValues.get(merg.colMeta.colIndex),
merg.colMeta.colType, merg.mergeType);
if (result != null)
{
toRow.fieldValues.set(merg.colMeta.colIndex, result);
}
}
}
}
// Turn each AVG column's SUM helper value into SUM/COUNT and drop the
// COUNT helper column from the row.
// NOTE(review): removing the COUNT field shifts the indexes of any later
// columns in fieldValues; presumably the index layout makes this safe —
// confirm against the column ordering produced by onRowMetaData.
private void mergAvg(RowDataPacket toRow) {
if (mergCols == null) {
return;
}
for (MergeCol merg : mergCols) {
if(merg.mergeType==MergeCol.MERGE_AVG)
{
byte[] result = mertFields(
toRow.fieldValues.get(merg.colMeta.avgSumIndex),
toRow.fieldValues.get(merg.colMeta.avgCountIndex),
merg.colMeta.colType, merg.mergeType);
if (result != null)
{
toRow.fieldValues.set(merg.colMeta.avgSumIndex, result);
toRow.fieldValues.remove(merg.colMeta.avgCountIndex) ;
toRow.fieldCount=toRow.fieldCount-1;
}
}
}
}
// Merge two column values according to the aggregate type; returns null
// when the type is unknown. A null/empty operand short-circuits to the
// other operand.
private byte[] mertFields(byte[] bs, byte[] bs2, int colType, int mergeType) {
if(bs2==null || bs2.length==0)
{
return bs;
}else if(bs==null || bs.length==0)
{
return bs2;
}
switch (mergeType) {
// SUM: decimal/floating types are added as doubles ...
case MergeCol.MERGE_SUM:
if (colType == ColMeta.COL_TYPE_NEWDECIMAL
|| colType == ColMeta.COL_TYPE_DOUBLE
|| colType == ColMeta.COL_TYPE_FLOAT
|| colType == ColMeta.COL_TYPE_DECIMAL) {
Double vale = ByteUtil.getDouble(bs) + ByteUtil.getDouble(bs2);
return vale.toString().getBytes();
}
// ... otherwise fall through to COUNT's exact long addition.
// NOTE(review): the fall-through looks intentional (integer SUM
// shares COUNT's arithmetic) — confirm before restructuring.
case MergeCol.MERGE_COUNT: {
long s1 = Long.parseLong(new String(bs));
long s2 = Long.parseLong(new String(bs2));
long total = s1 + s2;
return LongUtil.toBytes(total);
}
// MAX: keep the larger value.
case MergeCol.MERGE_MAX: {
int compare = ByteUtil.compareNumberByte(bs, bs2);
return (compare > 0) ? bs : bs2;
}
// MIN: keep the smaller value.
case MergeCol.MERGE_MIN: {
int compare = ByteUtil.compareNumberByte(bs, bs2);
return (compare > 0) ? bs2 : bs;
}
// AVG: bs holds the accumulated SUM, bs2 the accumulated COUNT.
case MergeCol.MERGE_AVG: {
double aDouble = ByteUtil.getDouble(bs);
long s2 = Long.parseLong(new String(bs2));
Double vale = aDouble / s2;
return vale.toString().getBytes();
}
default:
return null;
}
}
// True when newRow belongs to the same group as existRow, i.e. all
// GROUP BY column values match byte-wise.
private boolean sameGropuColums(RowDataPacket newRow, RowDataPacket existRow) {
if (groupColumnIndexs == null) {// no GROUP BY columns (e.g. select count(*) from aaa):
// everything is one global group
return true;
}
for (int i = 0; i < groupColumnIndexs.length; i++) {
if (!Arrays.equals(newRow.fieldValues.get(groupColumnIndexs[i]),
existRow.fieldValues.get(groupColumnIndexs[i]))) {
return false;
}
}
return true;
}
}
数据汇聚
group by实际上也需要对数据进行排序,与order by相比,group by多了排序后的数据分组,在分组的同时也可以使用以下聚合函数:count、sum、max、min、avg。在MySQL中group by有三种实现方式:第一种利用松散(Loose)索引扫描实现,第二种使用紧凑(Tight)索引扫描实现,第三种使用临时表实现。当所有分片返回已经汇聚好的数据时,Mycat对返回的所有数据进行合并汇聚后再返回客户端,执行流程如下图:
8.4.2.1 相关类说明
MergeCol: 聚合方法
ColMeta:聚合方法以及聚合类型
RowDataPacketGrouper:数据汇聚类
8.4.2.2 代码解析
以下是数据汇聚调用入口以及数据返回
public class DataMergeService implements Runnable {
/**
 * Builds the row grouper once column metadata is available. Every aggregate
 * column of the statement is registered as a MergeCol; AVG is decomposed
 * into its helper SUM and COUNT columns (named <col>SUM / <col>COUNT in the
 * select list) so it can be recomputed after the per-shard rows are merged.
 */
public void onRowMetaData(Map<String, ColMeta> columToIndx, int fieldCount) {
if (rrs.isHasAggrColumn()) {
List<MergeCol> mergCols = new LinkedList<MergeCol>();
Map<String, Integer> mergeColsMap = rrs.getMergeCols();
if (mergeColsMap != null) {
for (Map.Entry<String, Integer> mergEntry : mergeColsMap
.entrySet()) {
String colName = mergEntry.getKey().toUpperCase();
int type = mergEntry.getValue();
if (MergeCol.MERGE_AVG == type) {
// AVG is rewritten as SUM/COUNT; both helper columns must exist.
ColMeta sumColMeta = columToIndx.get(colName + "SUM");
ColMeta countColMeta = columToIndx.get(colName
+ "COUNT");
if (sumColMeta != null && countColMeta != null) {
ColMeta colMeta = new ColMeta(sumColMeta.colIndex,
countColMeta.colIndex,
sumColMeta.getColType());
mergCols.add(new MergeCol(colMeta, mergEntry
.getValue()));
}
} else {
ColMeta colMeta = columToIndx.get(colName);
mergCols.add(new MergeCol(colMeta, mergEntry.getValue()));
}
}
}
// Also register aggregate columns that carry no alias, e.g. COUNT(*).
for (Map.Entry<String, ColMeta> fieldEntry : columToIndx.entrySet()) {
String colName = fieldEntry.getKey();
int result = MergeCol.tryParseAggCol(colName);
if (result != MergeCol.MERGE_UNSUPPORT
&& result != MergeCol.MERGE_NOMERGE) {
mergCols.add(new MergeCol(fieldEntry.getValue(), result));
}
}
grouper = new RowDataPacketGrouper(groupColumnIndexs,
mergCols.toArray(new MergeCol[mergCols.size()]),
rrs.getHavingCols());
}
}
/**
 * Returns the merged result rows (at most i*(offset+size) rows).
 *
 * @param eof trailing EOF packet bytes (not used in this method)
 * @return grouped and sorted rows ready to be written to the client
 */
private List<RowDataPacket> getResults(byte[] eof) {
List<RowDataPacket> tmpResult = result;
if (this.grouper != null) {
tmpResult = grouper.getResult();
grouper = null;
}
if (sorter != null) {
// Feed the rows produced by the grouper into the sorter, releasing
// each row from the intermediate list as we go.
if (tmpResult != null) {
Iterator<RowDataPacket> itor = tmpResult.iterator();
while (itor.hasNext()) {
sorter.addRow(itor.next());
itor.remove();
}
}
// Fetch the ORDER BY / LIMIT processed rows.
tmpResult = sorter.getSortedResult();
sorter = null;
}
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("prepare mpp merge result for " + rrs.getStatement());
}
return tmpResult;
}
}
数据汇聚代码实现
/**
 * Merges rows returned by the shards according to the GROUP BY columns,
 * applying the registered aggregate functions (SUM/COUNT/MAX/MIN/AVG) and
 * an optional HAVING filter.
 */
public class RowDataPacketGrouper {
// Aggregated result rows; wrapped in a synchronized list — presumably rows
// can arrive from multiple I/O threads. NOTE(review): iteration in addRow()
// is not guarded by the list's lock; confirm callers serialize access.
private List<RowDataPacket> result = Collections.synchronizedList(new ArrayList<RowDataPacket>());//aggregated result rows
private final MergeCol[] mergCols;// aggregate columns and their merge types
private final int[] groupColumnIndexs;// select-list indexes of GROUP BY columns
private boolean isMergAvg=false;// whether AVG columns were already finalized
private HavingCols havingCols;// optional HAVING condition
public RowDataPacketGrouper(int[] groupColumnIndexs, MergeCol[] mergCols,HavingCols havingCols) {
super();
this.groupColumnIndexs = groupColumnIndexs;
this.mergCols = mergCols;
this.havingCols = havingCols;
}
/**
 * Finalizes and returns the grouped rows: AVG columns are computed from
 * their SUM/COUNT helpers exactly once, then the HAVING filter (if any)
 * is applied.
 */
public List<RowDataPacket> getResult() {
// Finalize AVG columns only once.
if(!isMergAvg)
{
for (RowDataPacket row : result)
{
// Replace each AVG column's SUM value with SUM/COUNT.
mergAvg(row);
}
isMergAvg=true;
}
// Drop rows that fail the HAVING condition.
if(havingCols != null){
filterHaving();
}
return result;
}
// Merge a new row into its matching group, or start a new group.
public void addRow(RowDataPacket rowDataPkg) {
for (RowDataPacket row : result) {
if (sameGropuColums(rowDataPkg, row)) {
aggregateRow(row, rowDataPkg);
return;
}
}
// No matching group yet — insert the row as a new group.
result.add(rowDataPkg);
}
// Fold newRow's aggregate columns into toRow. AVG is skipped here; it is
// finalized later in mergAvg() from the SUM/COUNT helper columns.
private void aggregateRow(RowDataPacket toRow, RowDataPacket newRow) {
if (mergCols == null) {
return;
}
for (MergeCol merg : mergCols) {
if(merg.mergeType!=MergeCol.MERGE_AVG)
{
byte[] result = mertFields(
toRow.fieldValues.get(merg.colMeta.colIndex),
newRow.fieldValues.get(merg.colMeta.colIndex),
merg.colMeta.colType, merg.mergeType);
if (result != null)
{
toRow.fieldValues.set(merg.colMeta.colIndex, result);
}
}
}
}
// Turn each AVG column's SUM helper value into SUM/COUNT and drop the
// COUNT helper column from the row.
// NOTE(review): removing the COUNT field shifts the indexes of any later
// columns in fieldValues; presumably the index layout makes this safe —
// confirm against the column ordering produced by onRowMetaData.
private void mergAvg(RowDataPacket toRow) {
if (mergCols == null) {
return;
}
for (MergeCol merg : mergCols) {
if(merg.mergeType==MergeCol.MERGE_AVG)
{
byte[] result = mertFields(
toRow.fieldValues.get(merg.colMeta.avgSumIndex),
toRow.fieldValues.get(merg.colMeta.avgCountIndex),
merg.colMeta.colType, merg.mergeType);
if (result != null)
{
toRow.fieldValues.set(merg.colMeta.avgSumIndex, result);
toRow.fieldValues.remove(merg.colMeta.avgCountIndex) ;
toRow.fieldCount=toRow.fieldCount-1;
}
}
}
}
// Merge two column values according to the aggregate type; returns null
// when the type is unknown. A null/empty operand short-circuits to the
// other operand.
private byte[] mertFields(byte[] bs, byte[] bs2, int colType, int mergeType) {
if(bs2==null || bs2.length==0)
{
return bs;
}else if(bs==null || bs.length==0)
{
return bs2;
}
switch (mergeType) {
// SUM: decimal/floating types are added as doubles ...
case MergeCol.MERGE_SUM:
if (colType == ColMeta.COL_TYPE_NEWDECIMAL
|| colType == ColMeta.COL_TYPE_DOUBLE
|| colType == ColMeta.COL_TYPE_FLOAT
|| colType == ColMeta.COL_TYPE_DECIMAL) {
Double vale = ByteUtil.getDouble(bs) + ByteUtil.getDouble(bs2);
return vale.toString().getBytes();
}
// ... otherwise fall through to COUNT's exact long addition.
// NOTE(review): the fall-through looks intentional (integer SUM
// shares COUNT's arithmetic) — confirm before restructuring.
case MergeCol.MERGE_COUNT: {
long s1 = Long.parseLong(new String(bs));
long s2 = Long.parseLong(new String(bs2));
long total = s1 + s2;
return LongUtil.toBytes(total);
}
// MAX: keep the larger value.
case MergeCol.MERGE_MAX: {
int compare = ByteUtil.compareNumberByte(bs, bs2);
return (compare > 0) ? bs : bs2;
}
// MIN: keep the smaller value.
case MergeCol.MERGE_MIN: {
int compare = ByteUtil.compareNumberByte(bs, bs2);
return (compare > 0) ? bs2 : bs;
}
// AVG: bs holds the accumulated SUM, bs2 the accumulated COUNT.
case MergeCol.MERGE_AVG: {
double aDouble = ByteUtil.getDouble(bs);
long s2 = Long.parseLong(new String(bs2));
Double vale = aDouble / s2;
return vale.toString().getBytes();
}
default:
return null;
}
}
// True when newRow belongs to the same group as existRow, i.e. all
// GROUP BY column values match byte-wise.
private boolean sameGropuColums(RowDataPacket newRow, RowDataPacket existRow) {
if (groupColumnIndexs == null) {// no GROUP BY columns (e.g. select count(*) from aaa):
// everything is one global group
return true;
}
for (int i = 0; i < groupColumnIndexs.length; i++) {
if (!Arrays.equals(newRow.fieldValues.get(groupColumnIndexs[i]),
existRow.fieldValues.get(groupColumnIndexs[i]))) {
return false;
}
}
return true;
}
}