http://www.cnblogs.com/gnool/p/5883209.html
K均值(KMeans)应用-异常检测应用
原理:
根据数据样本建立KMeans机器学习模型,获取各簇的簇心点;计算每个样本点到其所属簇心的距离,取所有距离中第五大的值作为阈值,到簇心的距离大于该阈值的点即为异常点,由此检测出数据异常。
如图:
数据样本:
1
|
1,2.43,2.3899999 2,2.38,2.12 3,2.8,2.51 4,2.01,2.69 5,2.71,2.45 6,2.55,2.34 <br>7,2.46,2.31 8,2.27,2.38 9,2.87,2.55 10,2.75,2.07 11,2.3899999,2.6100001 12,2.67,2.31 <br>13,2.68,2.75 14,2.47,2.05 15,2.96,2.66 16,2.08,2.92 17,2.58,2.12 18,2.69,2.72 <br>19,2.29,2.81 20,2.2,2.2 21,2.46,2.87 22,2.66,2.92 23,2.71,2.63 24,2.09,2.99 <br>25,2.33,2.84 26,2.4,2.63 27,2.05,2.27 28,2.59,2.81 29,2.68,2.72 30,2.5,2.29 <br>31,2.63,2.8899999 32,2.35,2.8600001 33,2.74,2.06 34,2.83,2.56 35,2.3600001,2.87 36,2.25,2.32 <br>37,2.99,2.85 38,2.19,2.62 39,2.37,2.19 40,2.37,2.08 41,2.62,2.25 42,2.16,2.56 <br>43,2.08,2.37 44,2.77,2.55 45,2.96,2.85 46,2.52,2.24 47,2.6,2.55 48,2.78,2.14 <br>49,2.76,2.42 50,2.05,2.67 51,2.94,2.82 52,2.52,2.59 53,2.04,2.08 54,2.65,2.03 <br>55,2.32,2.88 56,2.96,2.2 57,2.97,2.28 58,2.01,2.6399999 59,2.58,2.52 60,2.55,2.7 <br>61,2.75,2.19 62,2.28,2.48 63,2.6399999,2.54 64,2.34,2.27 65,2.72,2.23 66,2.5,2.35 <br>67,2.25,2.2 68,2.27,2.91 69,2.8899999,2.88 70,2.76,2.48 71,2.63,2.22 72,2.69,2.33 <br>73,2.9,2.02 74,2.23,2.26 75,2.82,2.87 76,2.57,2.83 77,2.97,2.47 78,2.69,2.54 <br>79,2.6,2.84 80,2.98,2.99 81,2.21,2.3899999 82,2.11,2.46 83,2.54,2.77 84,2.57,2.19 <br>85,2.66,2.77 86,2.4,2.88 87,2.43,2.75 88,2.35,2.05 89,2.68,2.25 90,2.43,2.87 <br>91,2.06,2.05 92,2.8600001,2.6100001 93,2.58,2.75 94,2.91,2.8 95,2.38,2.95 96,2.63,2.58 <br>97,2.82,2.93 98,2.72,2.97 99,2.16,2.55 100,5.46,5.1 101,5.9,5.39 102,5.81,5.91 <br>103,5.92,5.65 104,5.91,5.94 105,5.9,5.91 106,5.7799997,5.66 107,5.76,5.32 108,5.11,5.77 <br>109,5.38,5.46 110,5.63,5.76 111,5.1,5.7200003 112,5.66,5.31 113,5.86,5.6 114,5.46,5.74 <br>115,5.76,5.17 116,5.39,5.24 117,5.33,5.49 118,5.05,5.28 119,5.8,5.63 120,5.0,5.18 <br>121,5.35,5.71 122,5.5299997,5.45 123,5.95,5.04 124,5.17,5.32 125,5.83,5.56 126,5.67,5.55 <br>127,5.63,5.25 128,5.42,5.27 129,5.38,5.57 130,5.39,5.6 131,5.88,5.41 132,5.84,5.38 <br>133,5.95,5.36 134,5.65,5.43 135,5.76,5.05 136,5.65,5.5 137,5.13,5.07 138,5.79,5.87 <br>139,5.87,5.38 140,5.9,5.96 
141,5.28,5.05 142,5.8,5.61 143,5.24,5.24 144,5.08,5.35 <br>145,5.38,5.5299997 146,5.4,5.62 147,5.73,5.0 148,5.3,5.1 149,5.34,5.39 150,5.63,5.34 <br>151,5.4,5.29 152,5.23,5.26 153,5.04,5.25 154,5.49,5.83 155,5.89,5.18 156,5.18,5.85 <br>157,5.41,5.67 158,5.81,5.7200003 159,5.62,5.41 160,5.79,5.5 161,5.35,5.94 162,5.31,5.68 <br>163,5.14,5.74 164,5.37,5.59 165,5.19,5.91 166,5.62,5.64 167,5.26,5.38 168,5.74,5.91 <br>169,5.17,5.8 170,5.68,5.13 171,5.67,5.21 172,5.2,5.49 173,5.89,5.87 174,5.8,5.22 <br>175,5.01,5.31 176,5.0,5.28 177,5.95,5.56 178,5.27,5.23 179,5.9,5.74 180,5.21,5.75 <br>181,5.13,5.3 182,5.36,5.0 183,5.21,5.86 184,5.21,5.56 185,5.7799997,5.15 186,5.04,5.4 <br>187,5.52,5.61 188,5.02,5.99 189,5.32,5.04 190,5.81,5.51 191,5.76,5.29 192,5.03,5.62 <br>193,5.08,5.26 194,5.42,5.4 195,5.28,5.04 196,5.2,5.49 197,5.7799997,5.33 198,5.38,5.71 <br>199,5.9700003,5.96 200,8.51,8.93 201,8.43,8.58 202,8.62,8.31 203,8.08,8.52 204,8.31,8.49 <br>205,8.4,8.97 206,8.6,8.74 207,8.96,8.76 208,8.0,8.79 209,8.04,8.0 210,8.71,8.23 <br>211,8.78,8.4 212,8.85,8.34 213,8.04,8.74 214,8.92,8.55 215,8.0,8.9 216,8.24,8.45 <br>217,8.33,8.35 218,8.83,8.94 219,8.23,8.06 220,8.46,8.85 221,8.39,8.59 222,8.7,8.85 <br>223,8.45,8.68 224,8.86,8.74 225,8.11,8.18 226,8.11,8.27 227,8.15,8.35 228,8.99,8.27 <br>229,8.67,8.12 230,8.18,8.92 231,8.58,8.58 232,8.05,8.67 233,8.97,8.11 234,8.76,8.49 <br>235,8.18,8.54 236,8.82,8.64 237,8.74,8.89 238,8.82,8.77 239,8.02,8.33 240,8.77,8.54 <br>241,8.22,8.13 242,8.92,8.35 243,8.71,8.55 244,8.12,8.74 245,8.07,8.96 246,8.71,8.17 <br>247,8.12,8.4 248,8.03,8.92 249,8.99,8.55 250,8.63,8.19 251,8.95,8.82 252,8.25,8.32 <br>253,8.08,8.21 254,8.31,8.94 255,8.87,8.3 256,8.72,8.23 257,8.98,8.88 258,8.48,8.64 <br>259,8.81,8.3 260,8.15,8.07 261,8.36,8.02 262,8.16,8.22 263,8.77,8.44 264,8.51,8.17 <br>265,8.28,8.31 266,8.57,8.47 267,8.95,8.1 268,8.91,8.72 269,8.34,8.64 270,8.07,8.99 <br>271,8.3,8.75 272,8.35,8.75 273,8.9,8.22 274,8.99,8.94 275,8.67,8.37 276,8.27,8.0 
<br>277,8.68,8.93 278,8.18,8.45 279,8.25,8.82 280,8.99,8.17 281,8.36,8.17 282,8.64,8.38 <br>283,8.94,8.77 284,8.33,8.71 285,8.23,8.81 286,8.56,8.79 287,8.71,8.89 288,8.09,8.27 <br>289,8.93,8.0 290,8.66,8.23 291,8.35,8.1 292,8.15,8.54 293,8.72,8.03 294,8.64,8.76 <br>295,8.94,8.28 296,8.39,8.87 297,8.01,8.4 298,8.07,8.28 299,8.12,8.65 300,8.65,8.16
|
数据样本的数据格式为:标号,特征值1,特征值2(特征值没有具体含义,这些自动生成的数据只是为了简单地说明异常检测是怎么一回事,以及机器学习到底是如何应用在实际生产环境中的)
可视化展示:
我们将数据样本投射到可视化环境中,可以看到数据呈现以下图形:
数据被分为3簇,因此我们训练模型时将K值设为3。由于数据非常集中,数据量也非常少,同时特征向量为二维特征向量,故投影成平面图形后我们一眼就可以看出数据分为几簇;当样本数据的特征值很多时,就得靠计算得出K值(这里先不提)。
应用代码实践:
// KMeans-based anomaly detection.
// NOTE(review): this snippet assumes a SparkContext named `sc` is in scope
// (e.g. spark-shell) and that Vector, Vectors, KMeans and KMeansModel from
// Spark MLlib are already imported -- confirm against the surrounding file.

// Parses one "label,feature1,feature2,..." record into (label, dense feature vector).
// Throws NumberFormatException if a feature field is not a valid number.
def parseLabeledVector(line: String): (String, Vector) = {
  val fields = line.split(',')
  (fields.head, Vectors.dense(fields.tail.map(_.toDouble)))
}

// Load the training samples.
val rawData = sc.textFile("D:/logdata/kmeans.txt")

// Convert the samples into (label, vector) pairs the model can work with.
// Blank lines are skipped so they do not turn into zero-dimension vectors.
val labelAndData = rawData.filter(_.trim.nonEmpty).map(parseLabeledVector)

// Cache the feature vectors: they are used both for training and for the
// distance computation below.
val data = labelAndData.values.cache()

// Set up the KMeans learner with K = 3 clusters.
val kmeans = new KMeans()
kmeans.setK(3)

// Train the model.
val model = kmeans.run(data)

// Print the cluster centers.
model.clusterCenters.foreach(println)

// Euclidean distance between two vectors.
def distance(a: Vector, b: Vector): Double = {
  math.sqrt(a.toArray.zip(b.toArray).map(p => p._1 - p._2).map(d => d * d).sum)
}

// Distance from a vector to the center of the cluster the model assigns it to.
def distToCentroid(datum: Vector, model: KMeansModel): Double = {
  val cluster = model.predict(datum)
  val centroid = model.clusterCenters(cluster)
  distance(centroid, datum)
}

// Distance from every training point to its cluster center.
val distances = data.map(datum => distToCentroid(datum, model))

// Use the fifth-largest distance (over all points, not per cluster) as the threshold.
val threshold = distances.top(5).last

// Load and parse the test data the same way as the training data.
val testRawData = sc.textFile("D:/logdata/kmeans")
val testLabelAndData = testRawData.filter(_.trim.nonEmpty).map(parseLabeledVector)

// Cache the test feature vectors.
val testData = testLabelAndData.values.cache()

// Keep the points whose distance to their cluster center exceeds the threshold,
// collect them to the driver, and print them.
val anomalies = testData.filter(x => distToCentroid(x, model) > threshold).collect()
anomalies.foreach(println)
计算结果(簇心点):
[5.525200003000001,5.494100009000001]
[2.522222221212122,2.512020205050505]
[8.483267326732673,8.49178217821782]
异常值:
[6.73,6.58]
[6.62,6.04]
[6.99,6.66]
[6.59,6.38]
[6.42,6.74]
[6.37,6.59]
[6.84,6.03]
[6.84,6.03]
[6.9700003,6.5299997]
[6.03,6.31]
[6.18,6.27]
[6.84,6.81]
[6.3,6.93]
[6.49,6.23]
[6.16,6.67]
[6.56,6.77]
[6.57,6.32]
[6.37,6.55]
[6.68,6.07]
[6.8,6.4]
[6.91,6.44]