1. Scala 余弦相似度算法
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
/**
 * Character-level cosine similarity between two strings.
 *
 * Each string is turned into a vector of per-character occurrence counts over
 * the union of characters found in both strings; the similarity is the cosine
 * of the angle between the two vectors.
 *
 * Example:
 * {{{
 * val result = new test().textCosine("听说菠萝就是凤梨", "西瓜肯定不会是凤梨")
 * println(s"cosine similarity: $result")
 * }}}
 */
class test {

  /**
   * Euclidean norm (length) of a vector.
   *
   * @param vec the vector components
   * @return the square root of the sum of squared components; 0.0 for an empty vector
   */
  def module(vec: Vector[Double]): Double =
    math.sqrt(vec.map(x => x * x).sum)

  /**
   * Inner (dot) product of two vectors.
   *
   * If the vectors differ in length, only the leading components that exist in
   * both are multiplied; the surplus components of the longer one are ignored.
   *
   * @param v1 first vector
   * @param v2 second vector
   * @return the sum of the pairwise products
   */
  def innerProduct(v1: Vector[Double], v2: Vector[Double]): Double =
    // A single pass over aligned pairs replaces the former nested i/j loop.
    v1.iterator.zip(v2.iterator).map { case (a, b) => a * b }.sum

  /**
   * Cosine of the angle between two vectors.
   *
   * @param v1 first vector
   * @param v2 second vector
   * @return the cosine, capped at 1.0 to absorb floating-point overshoot;
   *         0.0 when either vector has zero length, because the angle is
   *         undefined and the division would otherwise produce NaN
   */
  def cosvec(v1: Vector[Double], v2: Vector[Double]): Double = {
    val denominator = module(v1) * module(v2)
    if (denominator == 0.0) 0.0
    else {
      val cos = innerProduct(v1, v2) / denominator
      if (cos <= 1) cos else 1.0
    }
  }

  /**
   * Cosine similarity of two strings based on their character counts.
   *
   * Characters are compared as UTF-16 `Char` values. A `null` argument is
   * treated like an empty string, and an empty string yields 0.0.
   *
   * @param str1 first text
   * @param str2 second text
   * @return the cosine similarity of the two character-count vectors
   */
  def textCosine(str1: String, str2: String): Double = {
    val left = Option(str1).getOrElse("")
    val right = Option(str2).getOrElse("")

    // Sorted union of every character seen in either text; computed once and
    // shared so both vectors use the same component order.
    val alphabet: Vector[Char] = (left + right).toVector.distinct.sorted

    // Count each character in one pass instead of rescanning the text per character.
    def frequencies(text: String): Map[Char, Double] =
      text.groupBy(identity).map { case (ch, occurrences) => ch -> occurrences.length.toDouble }

    val leftCounts = frequencies(left)
    val rightCounts = frequencies(right)

    val vector1 = alphabet.map(ch => leftCounts.getOrElse(ch, 0.0))
    val vector2 = alphabet.map(ch => rightCounts.getOrElse(ch, 0.0))

    cosvec(vector1, vector2)
  }
}
2. UDF
import org.apache.hadoop.hive.ql.exec.UDF;
/**
 * Hive UDF that returns the cosine similarity of two strings, as computed by
 * {@code test.textCosine}.
 *
 * Author: Jerry
 * Date: 2022/08/04
 */
public class MyUDF extends UDF {

    /**
     * Entry point invoked for each row.
     *
     * @param str1 first text, may be null
     * @param str2 second text, may be null
     * @return the value of {@code test.textCosine(str1, str2)}, or null when
     *         either argument is null
     */
    public Double evaluate(String str1, String str2) {
        // Propagate null rather than passing it on, so a NULL column value
        // does not end in a NullPointerException.
        if (str1 == null || str2 == null) {
            return null;
        }
        return new test().textCosine(str1, str2);
    }

    // Manual smoke test:
    // public static void main(String[] args) {
    //     Double evaluate = new MyUDF().evaluate("听说菠萝就是凤梨", "西瓜肯定不会是凤梨");
    //     System.out.println(evaluate);
    // }
}
3. 打包上传
4. 创建udf
# 添加临时方法
hive: add jar hdfs://master:9001/root/my.jar;
hive:create temporary function myudf as "类路径";
hive: drop temporary function if exists myudf;
# 添加永久方法 (永久方法是按照数据库来添加的,哪个库需要就加给哪个库)
hive:create function my_db.myudf as "类路径" using jar "hdfs:///path/to/jar";