呵呵,前几天拿到了数据挖掘基础教程一书,感觉部分算法是基于统计学的原理的,而统计学是可以通过Oracle来实现。
关于数据挖掘关联规则的介绍,可以参见:http://baike.baidu.com/view/1076817.htm?fr=ala0_1
关联规则是形如X→Y的蕴涵式,
其中且, X和Y分别称为关联规则的先导(antecedent或left-hand-side, LHS)和后继(consequent或 right-hand-side, RHS) 。
关联规则在D中的支持度(support)是D中事务同时包含X、Y的百分比,即概率;=X^Y/D
置信度(confidence)是包含X的事务中同时又包含Y的百分比,即条件概率。 =(X^Y)/X
关联规则是有趣的,如果满足最小支持度阈值和最小置信度阈值。
若给定最小支持度α = n,最小置信度β = m,则分别通过以上的X^Y/D和(X^Y)/X,可获知是否存在关联
使用的原始数据
反范式后的数据
待统计项
代码示例
--创建各个购买单元项视图
create view distinct_trans as select distinct tranobject from purchase;
--创建各个事务内部的购买单元项
--可以用wm_concat函数
create view all_trans as
SELECT tranid,MAX(tranobjects)tranobjects
FROM(SELECT tranid,WMSYS.WM_CONCAT(tranobject)OVER(PARTITION BY tranid ORDER BY tranobject)tranobjects
FROM purchase)
GROUP BY tranid;
--也可以用sys_connect_by_path函数
create view all_trans as
select tranid,substr(tranobjects,2)tranobjects
from--格式化前面的逗号和空格
(select distinct tranid,FIRST_VALUE(tranobjects)OVER(PARTITION BY tranid ORDER BY levels desc)AS tranobjects--保留最大的那个
from
(select tranid,sys_connect_by_path(tranobject,',')tranobjects,level levels--各购买事务的内部排列组合
from purchase
connect by tranid=prior tranid and tranobject
)
);
--对所有购买单元项进行排列组合,即数据挖掘的X^Y项
create view all_zuhe as
select substr(sys_connect_by_path(tranobject,','),2)zuhe
from(select distinct tranobject from purchase)
connect by nocycle tranobject
select*from all_zuhe
--筛选出符合要求的排列组合,即数据挖掘的X项和Y项
create view full_zuhe as
select a.zuhe X,b.zuhe Y from all_zuhe a,all_zuhe b
where instr(a.zuhe,b.zuhe)=0and instr(b.zuhe,a.zuhe)=0
and not exists(select1from distinct_trans c
where instr(a.zuhe,c.tranobject)>0and instr(b.zuhe,c.tranobject)>0)
select*from full_zuhe
create or replace view tongji as
select xy,xy_total,x,x_total,y,y_total,transtotal
from
(select y||','||x xy,
(select count(*)from all_trans a where instr(a.tranobjects,c.x||','||c.y)>0or instr(a.tranobjects,c.y||','||c.x)>0)xy_total,--包含xy的事务数
y,
(select count(*)from all_trans b where instr(b.tranobjects,c.y)>0)y_total,--包含y的事务数
x,
(select count(*)from all_trans b where instr(b.tranobjects,c.x)>0)x_total,--包含x的事务数
d.transtotal--总事务数
from full_zuhe c,(select count(distinct tranid)transtotal from purchase)d
order by xy_total desc,x_total desc
)
select*from tongji where xy_total>=3and y_total>=3