import org.apache.spark.mllib.linalg.SparseVector
import scala.util.Random
/**
 * MinHash signature generator for sparse vectors.
 *
 * Each instance draws `hashNum` random linear hash functions of the form
 * `h(x) = (a * (x + 1) + b) mod HASH_PRIME` when it is constructed. The
 * generator is not seeded, so two separately constructed instances use
 * different coefficients; signatures are only comparable when they were
 * produced by the same instance (or a serialized copy of it).
 *
 * @param hashNum dimension of the signature vector, i.e. the number of hash functions
 */
class MinHash(hashNum: Int) extends Serializable {
  /** Modulus applied by every hash function; all hash values lie in `[0, HASH_PRIME)`. */
  val HASH_PRIME = 2038074743

  /** Random source used once, at construction time, to draw the hash coefficients. */
  val rand = new Random()

  /**
   * Coefficients `(a, b)` of the `hashNum` random hash functions,
   * with `a` in `[1, HASH_PRIME - 1]` and `b` in `[0, HASH_PRIME - 2]`.
   */
  val randCoefs: Array[(Int, Int)] = Array.fill(hashNum) {
    (1 + rand.nextInt(HASH_PRIME - 1), rand.nextInt(HASH_PRIME - 1))
  }

  /**
   * Computes the MinHash signature of a sparse vector from its active indices.
   *
   * Only `vector.indices` is read; the stored values are ignored. For every
   * hash function the smallest hash over all active indices is kept.
   *
   * @param vector sparse vector whose active indices form the set to be hashed
   * @return array of length `hashNum`; slot `i` holds the minimum of hash
   *         function `i` over the active indices, or `Int.MaxValue` when the
   *         vector has no active indices
   */
  def generateSignature(vector: SparseVector): Array[Int] = {
    val indexes = vector.indices
    randCoefs.map { case (a, b) =>
      // Int.MaxValue exceeds every real hash (< HASH_PRIME), so it doubles as
      // the "empty set" sentinel instead of failing on `.min` of an empty array.
      var minHash = Int.MaxValue
      var i = 0
      while (i < indexes.length) {
        // Evaluate in Long: (index + 1) * a can reach ~2^62, which would wrap
        // around (and possibly go negative) in 32-bit Int arithmetic.
        val hash = (((indexes(i).toLong + 1L) * a + b) % HASH_PRIME).toInt
        if (hash < minHash) minHash = hash
        i += 1
      }
      minHash
    }
  }
}