对apriori关联关系算法研究了一段时间,网上能搜到的例子,大部分是python写的,数据集长得像下面这样:
[[I1,I2,I5],[I2,I4],[I2,I3],[I1,I2,I4],[I1,I3],[I2,I3],[I1,I3],[I1,I2,I3,I5],[I1,I2,I3]]
而实际的交易数据,一般存储到关系型数据库中,数据是按下面的样子保存:
复制代码
TRAN_SEQ_NO,ITEM
1,I1
1,I2
1,I5
2,I2
2,I4
3,I2
3,I3
4,I1
4,I2
4,I4
5,I1
5,I3
6,I2
6,I3
7,I1
7,I3
8,I1
8,I2
8,I3
8,I5
9,I1
9,I2
9,I3
复制代码
而且python的程序,写了好多循环,效率不高。
根据小票数据在数据库中存储的特点,并且apriori算法也不是特别复杂,因此想用plsql实现一下。
plsql实现的aprioir算法,对原算法做了裁剪,只计算2项集和两个商品之间的关联关系,3项集以上的忽略不计。
表结构创建
1.小票表(交易事物表)
create table CMX_APRIORI_TRANSACTION
(
tran_seq_no NUMBER(20), --交易号
item VARCHAR2(25) --商品编码
)
2.频繁项集C1
create table CMX_APRIORI_L1
(
item VARCHAR2(25), --商品编码
support NUMBER, --支持度
cnt NUMBER --交易次数
)
2.关联关系L2(结果表)
create table CMX_APRIORI_L2
(
item_a VARCHAR2(25), --前件商品编码
item_b VARCHAR2(25), --后件商品编码
cnt NUMBER, --交易次数
support NUMBER, --支持度
conf_a_b NUMBER(12,6), --置信度
lift_a_b NUMBER(12,6) --提升度
)
说明:
小票表插入的数据,必须提前进行处理。同一张小票商品要去重。
完整plsql代码:
复制代码
CREATE OR REPLACE PACKAGE CMX_APRIORI_SQL IS
/*-----------------------------------------------------------------------
* PROCEDURE NAME : CMX_APRIORI_SQL
* COMMENTS : 商品关联关系计算
* CODED BY : ONELANG 2019-1-27
* CHANGED HISTORY :
-----------------------------------------------------------------------*/
FUNCTION TEST(O_ERROR_MESSAGE IN OUT VARCHAR2) RETURN BOOLEAN;
END CMX_APRIORI_SQL;
/
CREATE OR REPLACE PACKAGE BODY CMX_APRIORI_SQL IS
/*-----------------------------------------------------------------------
* PROCEDURE NAME : INIT_TEST_DATA
* COMMENTS : 初始化测试数据
* CODED BY : ONELANG 2019-1-27
* CHANGED HISTORY :
-----------------------------------------------------------------------*/
FUNCTION INIT_TEST_DATA(O_ERROR_MESSAGE IN OUT VARCHAR2) RETURN BOOLEAN IS
L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.INIT_TEST_DATA';
BEGIN
EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_TRANSACTION';
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (1,'I1');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (1,'I2');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (1,'I5');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (2,'I2');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (2,'I4');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (3,'I2');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (3,'I3');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (4,'I1');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (4,'I2');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (4,'I4');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (5,'I1');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (5,'I3');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (6,'I2');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (6,'I3');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (7,'I1');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (7,'I3');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (8,'I1');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (8,'I2');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (8,'I3');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (8,'I5');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (9,'I1');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (9,'I2');
INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
VALUES (9,'I3');
COMMIT;
RETURN TRUE;
EXCEPTION
WHEN OTHERS THEN
O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
RETURN FALSE;
END;
/*-----------------------------------------------------------------------
* PROCEDURE NAME : GET_TEST_L1
* COMMENTS : 计算频繁项集L1
* CODED BY : ONELANG 2019-1-27
* CHANGED HISTORY :
-----------------------------------------------------------------------*/
FUNCTION GET_TEST_L1(O_ERROR_MESSAGE IN OUT VARCHAR2,I_SUPPORT IN NUMBER) RETURN BOOLEAN IS
L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.GET_TEST_L1';
L_TOTAL NUMBER(8);
BEGIN
EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_L1';
SELECT COUNT(DISTINCT TRAN_SEQ_NO) INTO L_TOTAL
FROM CMX_APRIORI_TRANSACTION ;
INSERT INTO CMX_APRIORI_L1
SELECT ITEM,ROUND(COUNT(1) / L_TOTAL,6) SUPPORT,COUNT(1) CNT
FROM CMX_APRIORI_TRANSACTION A
GROUP BY ITEM
HAVING COUNT(1) / L_TOTAL >= I_SUPPORT ;
COMMIT;
RETURN TRUE;
EXCEPTION
WHEN OTHERS THEN
O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
RETURN FALSE;
END;
/*-----------------------------------------------------------------------
* PROCEDURE NAME : GET_L2
* COMMENTS : 获取关联关系L2
* CODED BY :
* CHANGED HISTORY :
-----------------------------------------------------------------------*/
FUNCTION GET_TEST_L2(O_ERROR_MESSAGE IN OUT VARCHAR2,I_SUPPORT IN NUMBER) RETURN BOOLEAN IS
L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.GET_L2';
L_TOTAL NUMBER(8);
BEGIN
EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_L2';
SELECT COUNT(DISTINCT TRAN_SEQ_NO) INTO L_TOTAL
FROM CMX_APRIORI_TRANSACTION ;
--2项集支持度
INSERT INTO CMX_APRIORI_L2(ITEM_A,ITEM_B,CNT,SUPPORT)
SELECT ITEM_A,ITEM_B,COUNT(DISTINCT TRAN_SEQ_NO) CNT, ROUND(COUNT(DISTINCT TRAN_SEQ_NO) / L_TOTAL,6) SUPPORT
FROM (SELECT A.TRAN_SEQ_NO,
A.ITEM ITEM_A,
B.ITEM ITEM_B
FROM CMX_APRIORI_TRANSACTION A,
CMX_APRIORI_TRANSACTION B
WHERE A.ITEM IN (SELECT ITEM FROM CMX_APRIORI_L1)
AND B.ITEM IN (SELECT ITEM FROM CMX_APRIORI_L1)
AND A.TRAN_SEQ_NO = B.TRAN_SEQ_NO
AND A.ITEM > B.ITEM
)
GROUP BY ITEM_A,ITEM_B
HAVING COUNT(DISTINCT TRAN_SEQ_NO) / L_TOTAL >= I_SUPPORT;
--置信度
UPDATE CMX_APRIORI_L2 L2
SET CONF_A_B = (SELECT L2.SUPPORT / L1.SUPPORT FROM CMX_APRIORI_L1 L1 WHERE L2.ITEM_A = L1.ITEM);
--提升度
UPDATE CMX_APRIORI_L2 L2
SET LIFT_A_B = (SELECT L2.CONF_A_B / L1.SUPPORT FROM CMX_APRIORI_L1 L1 WHERE L2.ITEM_B = L1.ITEM);
COMMIT;
RETURN TRUE;
EXCEPTION
WHEN OTHERS THEN
O_ERROR_MESSAGE :www.dasheng178.com= SQLERRM||TO_CHAR(SQLCODE);
RETURN FALSE;
END;
FUNCTION TEST(O_ERROR_MESSAGE IN OUT VARCHAR2) RETURN BOOLEAN IS
L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.TEST';
L_MIN_SUPP NUMBER(9,5);
BEGIN
L_MIN_SUPP := 0.1;
IF INIT_TEST_DATA(O_ERROR_MESSAGE) = FALSE THEN
RETURN FALSE;
END IF;
IF GET_TEST_L1(O_ERROR_MESSAGE,L_MIN_SUPP) = FALSE THEN
RETURN FALSE;
END IF;
IF GET_TEST_L2(O_ERROR_MESSAGE,L_MIN_SUPP) = FALSE THEN
RETURN FALSE;
END IF;
RETURN TRUE;
EXCEPTION
WHEN OTHERS THEN
O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
RETURN FALSE;
END;
END CMX_APRIORI_SQL;
/
复制代码
运行:
复制代码
declare
result boolean;
begin
-- Call the function
result := cmx_apriori_sql.test(www.tiaotiaoylzc.com/_error_message => :o_error_message);
-- Convert false/true/null to 0/1/null
:result := sys.diutil.bool_to_int(result);
end;
复制代码
运行结果:
复制代码
select * from cmx_apriori_l2
ITEM_A ITEM_B CNT SUPPORT CONF_A_B LIFT_A_B
1 I2 I1 4 0.444444 0.571428 0.857142
2 I4 I2 2 0.222222 1.000000 1.285714
3 I5 I3 1 0.111111 0.500000 0.750000
4 I3 I2 4 0.444444 0.666666 0.857142
5 I5 I1 2 0.222222 1.000000 1.499999
6 I3 I1 4 0.444444 0.666666 0.999999
7 I4 I1 1 0.111111 0.500000 0.750000
8 I5 I2 2 0.222222 1.000000 1.285714
复制代码
用一家门店,一年的销售数据计算一下就会发现,尿布->啤酒根本没有关联关系。意外+惊喜。