一、安装及其官方使用示例
安装hivemall
# Fetch the Hivemall sources and run the bundled build script from the checkout root.
$ git clone https://github.com/apache/incubator-hivemall.git
$ cd incubator-hivemall
$ bin/build.sh
启动hive和导入相关 jar
-- Make the Hivemall jar available to this Hive session, then run Hivemall's
-- define-all.hive DDL script (presumably this registers the Hivemall functions
-- such as sst and changefinder -- confirm against the script).
add jar /home/hadoop/incubator-hivemall/target/hivemall-core-0.4.2-rc.2-with-dependencies.jar;
source /home/hadoop/incubator-hivemall/resources/ddl/define-all.hive;
-- IF NOT EXISTS keeps the setup re-runnable: a plain CREATE DATABASE fails
-- once the database already exists.
CREATE DATABASE IF NOT EXISTS twitter;
USE twitter;
建立外部表及设定外部数据存放位置
-- External table over the '#'-delimited time-series text files.
-- Each input line is "<num>#<value>", e.g. "1#182.478":
--   num   : sequence number of the observation
--   value : observed value
-- IF NOT EXISTS makes the statement safe to re-run when the table is already defined.
CREATE EXTERNAL TABLE IF NOT EXISTS timeseries (
    num INT,
    value DOUBLE
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '#'
STORED AS TEXTFILE
LOCATION '/dataset/twitter/timeseries';
源数据格式
182.478
176.231
183.917
177.798
165.469
181.878
184.502
183.303
177.578
171.641
导入数据格式
1#182.478
2#176.231
3#183.917
4#177.798
5#165.469
6#181.878
7#184.502
8#183.303
9#177.578
10#171.641
数据上传hdfs
# Upload the '#'-delimited data file to the HDFS path used as the timeseries table's LOCATION.
hadoop fs -put twitter.t /dataset/twitter/timeseries
使用SST
-- Change-point detection with Hivemall's sst function (SST).
-- For every row, `result` carries changepoint_score and is_changepoint;
-- "-threshold 0.005" is the score cut-off: in the sample output, scores above
-- 0.005 are flagged is_changepoint = true.
SELECT
    num,
    sst(value, "-threshold 0.005") AS result
FROM timeseries
ORDER BY num ASC;
结果示例:
7551 {"changepoint_score":0.00453049288071683,"is_changepoint":false}
7552 {"changepoint_score":0.004711244102524104,"is_changepoint":false}
7553 {"changepoint_score":0.004814871928978115,"is_changepoint":false}
7554 {"changepoint_score":0.004968089640799422,"is_changepoint":false}
7555 {"changepoint_score":0.005709056330104878,"is_changepoint":true}
7556 {"changepoint_score":0.0044279766655132,"is_changepoint":false}
7557 {"changepoint_score":0.0034694956722586268,"is_changepoint":false}
7558 {"changepoint_score":0.002549056569322694,"is_changepoint":false}
7559 {"changepoint_score":0.0017395109108403473,"is_changepoint":false}
7560 {"changepoint_score":0.0010629833145070489,"is_changepoint":false}
Outlier and Change-Point Detection using ChangeFinder
-- Outlier and change-point detection with Hivemall's changefinder function.
-- For every row, `result` carries outlier_score, changepoint_score, is_outlier
-- and is_changepoint. The two options are the score cut-offs for the two flags:
-- in the sample output, outlier_score above 0.03 gives is_outlier = true and
-- changepoint_score above 0.0035 gives is_changepoint = true.
SELECT
    num,
    changefinder(value, "-outlier_threshold 0.03 -changepoint_threshold 0.0035") AS result
FROM timeseries
ORDER BY num ASC;
结果示例:
16 {"outlier_score":0.051287243859365894,"changepoint_score":0.003292139657059704,"is_outlier":true,"is_changepoint":false}
17 {"outlier_score":0.03994335565212781,"changepoint_score":0.003484242549446824,"is_outlier":true,"is_changepoint":false}
18 {"outlier_score":0.9153515196592132,"changepoint_score":0.0036439645550477373,"is_outlier":true,"is_changepoint":true}
19 {"outlier_score":0.03940593403992665,"changepoint_score":0.0035825157392152134,"is_outlier":true,"is_changepoint":true}
20 {"outlier_score":0.27172093630215555,"changepoint_score":0.003542822324886785,"is_outlier":true,"is_changepoint":true}
21 {"outlier_score":0.006784031454620809,"changepoint_score":0.0035029441620275975,"is_outlier":false,"is_changepoint":true}
二、参照上述示例,对顺丰数据进行时间序列异常检测
日期 平台 平台访问次数 序号 outlier_score changepoint_score is_outlier is_changepoint
2016/9/13 weixin 163770 82 0.517939405 0.002799403 TRUE FALSE
2016/8/13 weixin 163770 206 0.971553367 0.002563691 TRUE FALSE
2016/7/13 weixin 163770 329 0.978151518 0.002569553 TRUE FALSE
2016/6/12 weixin 163770 453 0.893766225 0.005063597 TRUE TRUE
2016/5/12 weixin 163770 579 0.846256467 0.183253975 TRUE TRUE
2016/4/11 weixin 163770 704 1.125480999 2.552122451 TRUE TRUE
2016/3/11 weixin 163770 828 0.799214319 1.437753369 TRUE TRUE
2016/2/9 weixin 163770 949 0.768004277 2.03710669 TRUE TRUE
2016/1/9 weixin 163770 1075 1.019095702 3.674865513 TRUE TRUE
2015/12/9 weixin 163770 1197 0.726433576 1.354028448 TRUE TRUE
2015/11/8 weixin 163770 1321 0.740644184 2.317692273 TRUE TRUE
2015/10/8 weixin 163770 1447 0.961904857 3.609401953 TRUE TRUE
2015/9/7 weixin 163770 1572 0.917379836 2.44891755 TRUE TRUE
筛选出 is_outlier=true 的记录
挑选出weixin平台的数据,发现2016年和2015年部分日期的平台访问次数完全相同(均为163770)
查看原始16年和15年顺丰数据
2016/01、2016/03、2016/05、2016/07、2016/08、2015/10、2015/12数据大小相等。
初步断定数据重复
通过对其下子文件大小比较,基本断定上述日期数据为同一个数据
hivemall官方手册:http://hivemall.incubator.apache.org/userguide/index.html
github工程地址: https://github.com/apache/incubator-hivemall