=========补充了关联规则的生成======== 比想象的要麻烦一点
关联规则可以是双向的,confidence(A-->B)=P(B|A)=support(A&B)/support(A)
所以在计算k项集的关联规则时,其分子都是k项集的支持度,分母为k-1项集的支持度,或者对应1项集的支持度
001 | /** |
002 | * |
003 | */ |
004 | package org.waitingfortime.datamining.association; |
005 |
006 | import java.io.BufferedReader; |
007 | import java.io.File; |
008 | import java.io.FileNotFoundException; |
009 | import java.io.FileOutputStream; |
010 | import java.io.FileReader; |
011 | import java.io.IOException; |
012 | import java.io.PrintStream; |
013 | import java.util.ArrayList; |
014 | import java.util.HashMap; |
015 | import java.util.Iterator; |
016 | import java.util.List; |
017 | import java.util.Map; |
018 | import java.util.Set; |
019 | import java.util.TreeSet; |
020 |
021 | /** |
022 | * @author mazhiyuan |
023 | * |
024 | */ |
025 | public class Apriori { |
026 | private int minNum; // 最小支持数 |
027 | private double minCon; // 最小置信度 |
028 | private List<Set<Integer>> records; // 原始数据 |
029 | private String output; // 输出路径 |
030 | private List<List<ItemSet>> result = new ArrayList<List<ItemSet>>(); // 频繁项集结果 |
031 | private List<ItemSet> fth; // 频繁1项集 |
032 |
033 | public Apriori( double minDegree, double minCon, String input, String output) { |
034 | this .output = output; |
035 | this .minCon = minCon; |
036 | init(input); |
037 | if (records.size() == 0 ) { |
038 | System.err.println( "不符合计算条件。退出!" ); |
039 | System.exit( 1 ); |
040 | } |
041 | minNum = ( int ) (minDegree * records.size()); |
042 | } |
043 |
044 | private void init(String path) { |
045 | // TODO Auto-generated method stub |
046 | records = new ArrayList<Set<Integer>>(); |
047 | try { |
048 | BufferedReader br = new BufferedReader( new FileReader( |
049 | new File(path))); |
050 |
051 | String line = null ; |
052 | Set<Integer> record; |
053 | while ((line = br.readLine()) != null ) { |
054 | if (! "" .equals(line.trim())) { |
055 | record = new TreeSet<Integer>(); |
056 | String[] items = line.split( " " ); |
057 | for (String item : items) { |
058 | record.add(Integer.valueOf(item)); |
059 | } |
060 | records.add(record); |
061 | } |
062 | } |
063 |
064 | br.close(); |
065 | } catch (IOException e) { |
066 | System.err.println( "读取事务文件失败。" ); |
067 | } |
068 | } |
069 |
070 | private void first() { |
071 | // TODO Auto-generated method stub |
072 | fth = new ArrayList<ItemSet>(); |
073 | Map<Integer, Integer> first = new HashMap<Integer, Integer>(); |
074 | for (Set<Integer> si : records) |
075 | for (Integer i : si) { |
076 | if (first.get(i) == null ) |
077 | first.put(i, 1 ); |
078 | else |
079 | first.put(i, first.get(i) + 1 ); |
080 | } |
081 |
082 | for (Integer i : first.keySet()) |
083 | if (first.get(i) >= minNum) |
084 | fth.add( new ItemSet(i, first.get(i))); |
085 |
086 | } |
087 |
088 | private void loop(List<ItemSet> items) { |
089 | // TODO Auto-generated method stub |
090 | List<ItemSet> copy = new ArrayList<ItemSet>(items); |
091 | List<ItemSet> res = new ArrayList<ItemSet>(); |
092 | int size = items.size(); |
093 |
094 | // 连接 |
095 | for ( int i = 0 ; i < size; i++) |
096 | for ( int j = i + 1 ; j < size; j++) |
097 | if (copy.get(i).isMerge(copy.get(j))) { |
098 | ItemSet is = new ItemSet(copy.get(i)); |
099 | is.merge(copy.get(j).item.last()); |
100 | res.add(is); |
101 | } |
102 | // 剪枝 |
103 | pruning(copy, res); |
104 |
105 | if (res.size() != 0 ) { |
106 | result.add(res); |
107 | loop(res); |
108 | } |
109 | } |
110 |
111 | private void pruning(List<ItemSet> pre, List<ItemSet> res) { |
112 | // TODO Auto-generated method stub |
113 | // step 1 k项集的子集属于k-1项集 |
114 | Iterator<ItemSet> ir = res.iterator(); |
115 | while (ir.hasNext()) { |
116 | // 获取所有k-1项子集 |
117 | ItemSet now = ir.next(); |
118 | Map<Integer, List<Integer>> ss = subSet(now); |
119 | // 判断是否在pre集中 |
120 | boolean flag = false ; |
121 | for (List<Integer> li : ss.values()) { |
122 | if (flag) |
123 | break ; |
124 | for (ItemSet pis : pre) { |
125 | if (pis.item.containsAll(li)) { |
126 | flag = false ; |
127 | break ; |
128 | } |
129 | flag = true ; |
130 | } |
131 | } |
132 | if (flag) { |
133 | ir.remove(); |
134 | continue ; |
135 | } |
136 | // step 2 支持度 |
137 | int i = 0 ; |
138 | for (Set<Integer> sr : records) { |
139 | if (sr.containsAll(now.item)) |
140 | i++; |
141 |
142 | now.support = i; |
143 | } |
144 | if (now.support < minNum) { |
145 | ir.remove(); |
146 | continue ; |
147 | } |
148 | // 产生关联规则 |
149 | double deno = now.support; |
150 | for (Map.Entry<Integer, List<Integer>> me : ss.entrySet()) { |
151 | ItemCon ic = new ItemCon(me.getKey(), me.getValue()); |
152 | int nume = 0 ; |
153 |
154 | for (ItemSet f : fth) |
155 | if (f.item.contains(me.getKey())) { |
156 | nume = f.support; |
157 | break ; |
158 | } |
159 | if (deno / nume > minCon) { |
160 | now.calcon(ic); |
161 | ic.setC1(deno / nume); |
162 | } |
163 | for (ItemSet pis : pre) |
164 | if (pis.item.size() == me.getValue().size() |
165 | && pis.item.containsAll(me.getValue())) { |
166 | nume = pis.support; |
167 | break ; |
168 | } |
169 | if (deno / nume > minCon) |
170 | ic.setC2(deno / nume); |
171 | } |
172 | } |
173 | } |
174 |
175 | private Map<Integer, List<Integer>> subSet(ItemSet is) { |
176 | // TODO Auto-generated method stub |
177 | List<Integer> li = new ArrayList<Integer>(is.item); |
178 | Map<Integer, List<Integer>> res = new HashMap<Integer, List<Integer>>(); |
179 | for ( int i = 0 , j = li.size(); i < j; i++) { |
180 | List<Integer> _li = new ArrayList<Integer>(li); |
181 | _li.remove(i); |
182 | res.put(li.get(i), _li); |
183 | } |
184 | return res; |
185 | } |
186 |
187 | private void output() throws FileNotFoundException { |
188 | if (result.size() == 0 ) { |
189 | System.err.println( "无结果集。退出!" ); |
190 | return ; |
191 | } |
192 | FileOutputStream out = new FileOutputStream(output); |
193 | PrintStream ps = new PrintStream(out); |
194 | for (List<ItemSet> li : result) { |
195 | ps.println( "=============频繁" + li.get( 0 ).item.size() |
196 | + "项集=============" ); |
197 | for (ItemSet is : li) { |
198 | ps.println(is.item + " : " + is.support); |
199 | ps.println(); |
200 | if (is.ics.size() != 0 ) { |
201 | ps.println( "******关联规则******" ); |
202 | for (ItemCon ic : is.ics) { |
203 | ps.println(ic.i + " ---> " + ic.li + " con: " |
204 | + ic.confidence1); |
205 | if (ic.confidence2 > minCon) |
206 | ps.println(ic.li + " ---> " + ic.i + " con: " |
207 | + ic.confidence2); |
208 | } |
209 | ps.println( "******************" ); |
210 | ps.println(); |
211 | } |
212 | } |
213 | ps.println( "=====================================" ); |
214 | } |
215 |
216 | ps.close(); |
217 | } |
218 |
219 | /** |
220 | * @param args |
221 | * @throws FileNotFoundException |
222 | */ |
223 | public static void main(String[] args) throws FileNotFoundException { |
224 | // TODO Auto-generated method stub |
225 | long begin = System.currentTimeMillis(); |
226 | Apriori apriori = new Apriori( 0.25 , 0.5 , |
227 | "/home/mazhiyuan/code/mushroom.dat" , |
228 | "/home/mazhiyuan/code/mout.data" ); |
229 | // apriori.first();//频繁1项集 |
230 | apriori.first(); |
231 | apriori.loop(apriori.fth); |
232 |
233 | apriori.output(); |
234 | System.out.println( "共耗时:" + ((System.currentTimeMillis()) - begin) |
235 | + "ms" ); |
236 | } |
237 | } |
238 |
239 | class ItemSet { |
240 | TreeSet<Integer> item; |
241 | int support; |
242 | List<ItemCon> ics = new ArrayList<ItemCon>(); // 关联规则结果 |
243 |
244 | ItemSet(ItemSet is) { |
245 | this .item = new TreeSet<Integer>(is.item); |
246 | } |
247 |
248 | ItemSet() { |
249 | item = new TreeSet<Integer>(); |
250 | } |
251 |
252 | ItemSet( int i, int v) { |
253 | this (); |
254 | merge(i); |
255 | setValue(v); |
256 | } |
257 |
258 | void setValue( int i) { |
259 | this .support = i; |
260 | } |
261 |
262 | void merge( int i) { |
263 | item.add(i); |
264 | } |
265 |
266 | void calcon(ItemCon ic) { |
267 | ics.add(ic); |
268 | } |
269 |
270 | boolean isMerge(ItemSet other) { |
271 | if (other == null || other.item.size() != item.size()) |
272 | return false ; |
273 | // 前k-1项相同,最后一项不同,满足连接条件 |
274 | /* |
275 | * Iterator<Integer> i = item.headSet(item.last()).iterator(); |
276 | * Iterator<Integer> o = |
277 | * other.item.headSet(other.item.last()).iterator(); while (i.hasNext() |
278 | * && o.hasNext()) if (i.next() != o.next()) return false; |
279 | */ |
280 | Iterator<Integer> i = item.iterator(); |
281 | Iterator<Integer> o = other.item.iterator(); |
282 | int n = item.size(); |
283 | while (i.hasNext() && o.hasNext() && --n > 0 ) |
284 | if (i.next() != o.next()) |
285 | return false ; |
286 |
287 | return !(item.last() == other.item.last()); |
288 | } |
289 | } |
290 |
291 | class ItemCon { |
292 | Integer i; |
293 | List<Integer> li; |
294 | double confidence1; |
295 | double confidence2; |
296 |
297 | ItemCon(Integer i, List<Integer> li) { |
298 | this .i = i; |
299 | this .li = li; |
300 | } |
301 |
302 | void setC1( double c1) { |
303 | this .confidence1 = c1; |
304 | } |
305 |
306 | void setC2( double c2) { |
307 | this .confidence2 = c2; |
308 | } |
309 | } |
Apriori算法本身的性能就是一大问题,产生太多的候选集,FP-TREE算法规避了这一问题,使得频繁项集的挖掘性能提高了至少一个数量级,下一篇重点介绍这个算法。