读代码-MinHashDriver及相关

用到:泛型类 counter 哈希实现

package org.apache.mahout.clustering.minhash;
public final class MinHashDriver extends AbstractJob

输入Sequence格式
输出根据debug模式可选向量和文本格式,文件可以Sequence和Text格式


Class<? extends Writable> outputClass =
debugOutput ? VectorWritable.class : Text.class;
Class<? extends OutputFormat> outputFormatClass =
debugOutput ? SequenceFileOutputFormat.class : TextOutputFormat.class;

job.setMapperClass(MinHashMapper.class);
job.setReducerClass(MinHashReducer.class);

job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(outputFormatClass);

job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(outputClass);

job.setOutputKeyClass(Text.class);
job.setOutputValueClass(outputClass);



package org.apache.mahout.clustering.minhash;
public class MinHashMapper extends Mapper<Text,VectorWritable,Text,Writable>

setup函数中
根据类型和数量生成hash函数组

hashFunction = HashFactory.createHashFunctions(hashType, numHashFunctions);


map函数
对于每个哈希函数
将函数应用于item的每个feature,将feature值转成4字节,取最小的哈希值

for (int i = 0; i < numHashFunctions; i++) {
for (Vector.Element ele : featureVector) {
int value = (int) ele.get();
bytesToHash[0] = (byte) (value >> 24);
bytesToHash[1] = (byte) (value >> 16);
bytesToHash[2] = (byte) (value >> 8);
bytesToHash[3] = (byte) value;
int hashIndex = hashFunction[i].hash(bytesToHash);
if (minHashValues[i] > hashIndex) {
minHashValues[i] = hashIndex;
}
}
}


聚类id的组合和分发
keyGroups控制id的组成段数
格式为XXX-XXX-XXX格式
每个item对每个哈希函数都分发一次


for (int i = 0; i < numHashFunctions; i++) {
StringBuilder clusterIdBuilder = new StringBuilder();
for (int j = 0; j < keyGroups; j++) {
clusterIdBuilder.append(minHashValues[(i + j) % numHashFunctions]).append('-');
}
String clusterId = clusterIdBuilder.toString();
clusterId = clusterId.substring(0, clusterId.lastIndexOf('-'));
Text cluster = new Text(clusterId);
Writable point;
if (debugOutput) {
point = new VectorWritable(featureVector.clone());
} else {
point = new Text(item.toString());
}
context.write(cluster, point);
}


package org.apache.mahout.clustering.minhash;
public class MinHashReducer extends Reducer<Text,Writable,Text,Writable>

reduce函数
根据debug类型解析不同类型

Collection<Writable> pointList = new ArrayList<Writable>();
for (Writable point : points) {
if (debugOutput) {
Vector pointVector = ((VectorWritable) point).get().clone();
Writable writablePointVector = new VectorWritable(pointVector);
pointList.add(writablePointVector);
} else {
Writable pointText = new Text(point.toString());
pointList.add(pointText);
}
}


counter使用,参数是enum类型
数量小于最小聚类数的被舍弃掉

if (pointList.size() >= minClusterSize) {
context.getCounter(Clusters.ACCEPTED).increment(1);
for (Writable point : pointList) {
context.write(cluster, point);
}
} else {
context.getCounter(Clusters.DISCARDED).increment(1);
}



package org.apache.mahout.clustering.minhash;
public final class HashFactory
哈希的实现

三种哈希类型

public enum HashType {
LINEAR, POLYNOMIAL, MURMUR
}


孪生素数,相差为2的两个数都是质数的情况
整型范围内最大孪生素数的较小值
RandomUtils.MAX_INT_SMALLER_TWIN_PRIME = 2147482949
哈希用素数取模冲突小

线性哈希

static class LinearHash implements HashFunction {
private final int seedA;
private final int seedB;

LinearHash(int seedA, int seedB) {
this.seedA = seedA;
this.seedB = seedB;
}

@Override
public int hash(byte[] bytes) {
long hashValue = 31;
for (long byteVal : bytes) {
hashValue *= seedA * byteVal;
hashValue += seedB;
}
return Math.abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME));
}
}


多项式哈希

static class PolynomialHash implements HashFunction {
private final int seedA;
private final int seedB;
private final int seedC;

PolynomialHash(int seedA, int seedB, int seedC) {
this.seedA = seedA;
this.seedB = seedB;
this.seedC = seedC;
}

@Override
public int hash(byte[] bytes) {
long hashValue = 31;
for (long byteVal : bytes) {
hashValue *= seedA * (byteVal >> 4);
hashValue += seedB * byteVal + seedC;
}
return Math
.abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME));
}
}


MurMur哈希

static class MurmurHashWrapper implements HashFunction {
private final int seed;

MurmurHashWrapper(int seed) {
this.seed = seed;
}

@Override
public int hash(byte[] bytes) {
long hashValue = MurmurHash.hash64A(bytes, seed);
return Math.abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME));
}
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值