关联分析-Apriori算法Java实现 支持度+置信度(2)

=========补充了关联规则的生成======== 比想象的要麻烦一点

关联规则可以是双向的,confidence(A-->B)=P(B|A)=support(A&B)/support(A),同理 confidence(B-->A)=P(A|B)=support(A&B)/support(B)

所以在计算k项集的关联规则时,其分子都是k项集的支持度,分母为对应k-1项子集的支持度,或对应1项集的支持度

001 /**
002  *
003  */
004 package org.waitingfortime.datamining.association;
005  
006 import java.io.BufferedReader;
007 import java.io.File;
008 import java.io.FileNotFoundException;
009 import java.io.FileOutputStream;
010 import java.io.FileReader;
011 import java.io.IOException;
012 import java.io.PrintStream;
013 import java.util.ArrayList;
014 import java.util.HashMap;
015 import java.util.Iterator;
016 import java.util.List;
017 import java.util.Map;
018 import java.util.Set;
019 import java.util.TreeSet;
020  
021 /**
022  * @author mazhiyuan
023  *
024  */
025 public class Apriori {
026     private int minNum;// 最小支持数
027     private double minCon;// 最小置信度
028     private List<Set<Integer>> records;// 原始数据
029     private String output;// 输出路径
030     private List<List<ItemSet>> result = new ArrayList<List<ItemSet>>();// 频繁项集结果
031     private List<ItemSet> fth;// 频繁1项集
032  
033     public Apriori(double minDegree, double minCon, String input, String output) {
034         this.output = output;
035         this.minCon = minCon;
036         init(input);
037         if (records.size() == 0) {
038             System.err.println("不符合计算条件。退出!");
039             System.exit(1);
040         }
041         minNum = (int) (minDegree * records.size());
042     }
043  
044     private void init(String path) {
045         // TODO Auto-generated method stub
046         records = new ArrayList<Set<Integer>>();
047         try {
048             BufferedReader br = new BufferedReader(new FileReader(
049                     new File(path)));
050  
051             String line = null;
052             Set<Integer> record;
053             while ((line = br.readLine()) != null) {
054                 if (!"".equals(line.trim())) {
055                     record = new TreeSet<Integer>();
056                     String[] items = line.split(" ");
057                     for (String item : items) {
058                         record.add(Integer.valueOf(item));
059                     }
060                     records.add(record);
061                 }
062             }
063  
064             br.close();
065         catch (IOException e) {
066             System.err.println("读取事务文件失败。");
067         }
068     }
069  
070     private void first() {
071         // TODO Auto-generated method stub
072         fth = new ArrayList<ItemSet>();
073         Map<Integer, Integer> first = new HashMap<Integer, Integer>();
074         for (Set<Integer> si : records)
075             for (Integer i : si) {
076                 if (first.get(i) == null)
077                     first.put(i, 1);
078                 else
079                     first.put(i, first.get(i) + 1);
080             }
081  
082         for (Integer i : first.keySet())
083             if (first.get(i) >= minNum)
084                 fth.add(new ItemSet(i, first.get(i)));
085  
086     }
087  
088     private void loop(List<ItemSet> items) {
089         // TODO Auto-generated method stub
090         List<ItemSet> copy = new ArrayList<ItemSet>(items);
091         List<ItemSet> res = new ArrayList<ItemSet>();
092         int size = items.size();
093  
094         // 连接
095         for (int i = 0; i < size; i++)
096             for (int j = i + 1; j < size; j++)
097                 if (copy.get(i).isMerge(copy.get(j))) {
098                     ItemSet is = new ItemSet(copy.get(i));
099                     is.merge(copy.get(j).item.last());
100                     res.add(is);
101                 }
102         // 剪枝
103         pruning(copy, res);
104  
105         if (res.size() != 0) {
106             result.add(res);
107             loop(res);
108         }
109     }
110  
111     private void pruning(List<ItemSet> pre, List<ItemSet> res) {
112         // TODO Auto-generated method stub
113         // step 1 k项集的子集属于k-1项集
114         Iterator<ItemSet> ir = res.iterator();
115         while (ir.hasNext()) {
116             // 获取所有k-1项子集
117             ItemSet now = ir.next();
118             Map<Integer, List<Integer>> ss = subSet(now);
119             // 判断是否在pre集中
120             boolean flag = false;
121             for (List<Integer> li : ss.values()) {
122                 if (flag)
123                     break;
124                 for (ItemSet pis : pre) {
125                     if (pis.item.containsAll(li)) {
126                         flag = false;
127                         break;
128                     }
129                     flag = true;
130                 }
131             }
132             if (flag) {
133                 ir.remove();
134                 continue;
135             }
136             // step 2 支持度
137             int i = 0;
138             for (Set<Integer> sr : records) {
139                 if (sr.containsAll(now.item))
140                     i++;
141  
142                 now.support = i;
143             }
144             if (now.support < minNum) {
145                 ir.remove();
146                 continue;
147             }
148             // 产生关联规则
149             double deno = now.support;
150             for (Map.Entry<Integer, List<Integer>> me : ss.entrySet()) {
151                 ItemCon ic = new ItemCon(me.getKey(), me.getValue());
152                 int nume = 0;
153  
154                 for (ItemSet f : fth)
155                     if (f.item.contains(me.getKey())) {
156                         nume = f.support;
157                         break;
158                     }
159                 if (deno / nume > minCon) {
160                     now.calcon(ic);
161                     ic.setC1(deno / nume);
162                 }
163                 for (ItemSet pis : pre)
164                     if (pis.item.size() == me.getValue().size()
165                             && pis.item.containsAll(me.getValue())) {
166                         nume = pis.support;
167                         break;
168                     }
169                 if (deno / nume > minCon)
170                     ic.setC2(deno / nume);
171             }
172         }
173     }
174  
175     private Map<Integer, List<Integer>> subSet(ItemSet is) {
176         // TODO Auto-generated method stub
177         List<Integer> li = new ArrayList<Integer>(is.item);
178         Map<Integer, List<Integer>> res = new HashMap<Integer, List<Integer>>();
179         for (int i = 0, j = li.size(); i < j; i++) {
180             List<Integer> _li = new ArrayList<Integer>(li);
181             _li.remove(i);
182             res.put(li.get(i), _li);
183         }
184         return res;
185     }
186  
187     private void output() throws FileNotFoundException {
188         if (result.size() == 0) {
189             System.err.println("无结果集。退出!");
190             return;
191         }
192         FileOutputStream out = new FileOutputStream(output);
193         PrintStream ps = new PrintStream(out);
194         for (List<ItemSet> li : result) {
195             ps.println("=============频繁" + li.get(0).item.size()
196                     "项集=============");
197             for (ItemSet is : li) {
198                 ps.println(is.item + " : " + is.support);
199                 ps.println();
200                 if (is.ics.size() != 0) {
201                     ps.println("******关联规则******");
202                     for (ItemCon ic : is.ics) {
203                         ps.println(ic.i + " ---> " + ic.li + " con: "
204                                 + ic.confidence1);
205                         if (ic.confidence2 > minCon)
206                             ps.println(ic.li + " ---> " + ic.i + " con: "
207                                     + ic.confidence2);
208                     }
209                     ps.println("******************");
210                     ps.println();
211                 }
212             }
213             ps.println("=====================================");
214         }
215  
216         ps.close();
217     }
218  
219     /**
220      * @param args
221      * @throws FileNotFoundException
222      */
223     public static void main(String[] args) throws FileNotFoundException {
224         // TODO Auto-generated method stub
225         long begin = System.currentTimeMillis();
226         Apriori apriori = new Apriori(0.250.5,
227                 "/home/mazhiyuan/code/mushroom.dat",
228                 "/home/mazhiyuan/code/mout.data");
229         // apriori.first();//频繁1项集
230         apriori.first();
231         apriori.loop(apriori.fth);
232  
233         apriori.output();
234         System.out.println("共耗时:" + ((System.currentTimeMillis()) - begin)
235                 "ms");
236     }
237 }
238  
239 class ItemSet {
240     TreeSet<Integer> item;
241     int support;
242     List<ItemCon> ics = new ArrayList<ItemCon>(); // 关联规则结果
243  
244     ItemSet(ItemSet is) {
245         this.item = new TreeSet<Integer>(is.item);
246     }
247  
248     ItemSet() {
249         item = new TreeSet<Integer>();
250     }
251  
252     ItemSet(int i, int v) {
253         this();
254         merge(i);
255         setValue(v);
256     }
257  
258     void setValue(int i) {
259         this.support = i;
260     }
261  
262     void merge(int i) {
263         item.add(i);
264     }
265  
266     void calcon(ItemCon ic) {
267         ics.add(ic);
268     }
269  
270     boolean isMerge(ItemSet other) {
271         if (other == null || other.item.size() != item.size())
272             return false;
273         // 前k-1项相同,最后一项不同,满足连接条件
274         /*
275          * Iterator<Integer> i = item.headSet(item.last()).iterator();
276          * Iterator<Integer> o =
277          * other.item.headSet(other.item.last()).iterator(); while (i.hasNext()
278          * && o.hasNext()) if (i.next() != o.next()) return false;
279          */
280         Iterator<Integer> i = item.iterator();
281         Iterator<Integer> o = other.item.iterator();
282         int n = item.size();
283         while (i.hasNext() && o.hasNext() && --n > 0)
284             if (i.next() != o.next())
285                 return false;
286  
287         return !(item.last() == other.item.last());
288     }
289 }
290  
291 class ItemCon {
292     Integer i;
293     List<Integer> li;
294     double confidence1;
295     double confidence2;
296  
297     ItemCon(Integer i, List<Integer> li) {
298         this.i = i;
299         this.li = li;
300     }
301  
302     void setC1(double c1) {
303         this.confidence1 = c1;
304     }
305  
306     void setC2(double c2) {
307         this.confidence2 = c2;
308     }
309 }

Apriori算法本身的性能就是一大问题,产生太多的候选集,FP-TREE算法规避了这一问题,使得频繁项集的挖掘性能提高了至少一个量级,下一篇重点介绍这个算法。

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值