不多说直接上代码:
package cn.nononononono
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
/**
 * Text-processing utilities: character-level cosine similarity between strings.
 *
 * Created 2022/6/2 14:09.
 *
 * @author Alpha.SK.LXY
 */
object StrDealTool {

  /**
   * Euclidean norm (length) of a vector.
   *
   * @param vec vector components
   * @return the square root of the sum of the squared components; 0.0 for an empty vector
   */
  def module(vec: Vector[Double]): Double =
    math.sqrt(vec.map(x => x * x).sum)

  /**
   * Inner (dot) product of two vectors.
   *
   * Components are paired by index. If the vectors differ in length, the extra
   * components of the longer one are ignored.
   *
   * @param v1 first vector
   * @param v2 second vector
   * @return the sum of the pairwise products; 0.0 if either vector is empty
   */
  def innerProduct(v1: Vector[Double], v2: Vector[Double]): Double =
    // A single linear pass over index-aligned pairs; no intermediate buffer is built.
    v1.iterator.zip(v2.iterator).map { case (a, b) => a * b }.sum

  /**
   * Cosine of the angle between two vectors.
   *
   * @param v1 first vector
   * @param v2 second vector
   * @return the cosine, clamped to [-1.0, 1.0]; 0.0 when either vector has zero length
   */
  def cosvec(v1: Vector[Double], v2: Vector[Double]): Double = {
    val denominator = module(v1) * module(v2)
    // The cosine is undefined for a zero-length vector (0.0 / 0.0 is NaN).
    // Report "no similarity" in that case rather than letting NaN reach the clamp.
    if (denominator == 0.0) 0.0
    else {
      val cos = innerProduct(v1, v2) / denominator
      // Floating-point rounding can push the quotient slightly outside [-1, 1].
      math.max(-1.0, math.min(1.0, cos))
    }
  }

  /**
   * Character-level cosine similarity of two strings.
   *
   * Each string is turned into a vector of character counts over the sorted set of
   * all characters occurring in either string, and the two vectors are compared
   * with [[cosvec]]. Characters are compared as individual `Char` values.
   *
   * @param str1 first text
   * @param str2 second text
   * @return the cosine of the two count vectors; 0.0 when either string is empty
   */
  def textCosine(str1: String, str2: String): Double = {
    // Every distinct character of both strings, in a fixed (sorted) order so that
    // both count vectors share the same axes.
    val vocabulary: Vector[Char] = (str1 + str2).toVector.distinct.sorted
    val counts1 = charCounts(str1)
    val counts2 = charCounts(str2)
    val vec1 = vocabulary.map(ch => counts1.getOrElse(ch, 0.0))
    val vec2 = vocabulary.map(ch => counts2.getOrElse(ch, 0.0))
    cosvec(vec1, vec2)
  }

  /** Number of occurrences of each character in `text`. */
  private def charCounts(text: String): Map[Char, Double] =
    text.groupBy(identity).map { case (ch, occurrences) => ch -> occurrences.length.toDouble }

  /**
   * Demo entry point: prints the result of [[textCosine]] for three sample pairs
   * (identical text, reversed text, and different text).
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Identical strings.
    val str1 = "跟我读黑化肥发灰会挥发"
    val str2 = "跟我读黑化肥发灰会挥发"
    val result = textCosine(str1, str2)
    println("两句话的余弦距离: " + result)

    // Same characters in reverse order; character counts are order-independent.
    val str11 = "跟我读黑化肥发灰会挥发"
    val str22 = "发挥会灰发肥化黑读我跟"
    val result2 = textCosine(str11, str22)
    println("两句话的余弦距离(倒序): " + result2)

    // Different texts.
    val str111 = "跟我读黑化肥发灰会挥发"
    val str222 = "如果黑化肥发灰发会不会挥发"
    val result222 = textCosine(str111, str222)
    println("两句话的余弦距离(文本不一致): " + result222)
  }
}
结果:
Nice!
如有问题,欢迎在评论区交流,或者关注小程序《数据之巅》。