package com.dataming.association;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.log4j.Logger;
public class Apriori {
private static final Logger log = Logger.getLogger(Apriori.class); private int min_sport = 2; private List items; //这里面的内容一定要按照顺序存放 private List> bitVectorList = new ArrayList>(); private List candidateList = new ArrayList(); private List freqenceList = new ArrayList(); public static void main(String args[]){ Apriori apriori = new Apriori(); apriori.generateData(); apriori.apriMain(); apriori.printFreqItems(); } private void printFreqItems(){ CFCon cfcL = freqenceList.get(freqenceList.size() - 1); for(CF cf : cfcL.cfList){ String kk = ""; List itemList = cf.itemList; for(int i = 0; i < itemList.size(); i++){ if(i == 0){ kk = itemList.get(i); } else { kk += "," + itemList.get(i); } } log.info("freqence: " + kk + " supCount:" + cf.supCount); } } private void apriMain(){ //C1 CFCon cfcC1 = find_frequent_1_itemsets(); candidateList.add(cfcC1); CFCon cfcL1 = candidateToFreqent(cfcC1); freqenceList.add(cfcL1); CFCon cfcL = cfcL1; HashSet set = new HashSet(); for(int k = 2; cfcL != null && cfcL.cfList != null && cfcL.cfList.size() > 0; k++){ CFCon cfcCk = getCandateFroFreq(cfcL); //为cfcC计数 for(List bitVector : bitVectorList){ set.clear(); for(int i = 0; i < items.size(); i++){ int bit = bitVector.get(i); if(bit == 1){ set.add(items.get(i)); } } List cfList = cfcCk.cfList; for(CF cf : cfList){ List itemList = cf.itemList; boolean isAdd = true; for(String item : itemList){ if(!set.contains(item)){ isAdd = false; break; } } if(isAdd)cf.supCount++; } } cfcL = candidateToFreqent(cfcCk); if(cfcCk.cfList != null && cfcCk.cfList.size() > 0)candidateList.add(cfcCk); if(cfcL.cfList != null && cfcL.cfList.size() > 0)freqenceList.add(cfcL); } } /** * 从L(k-1) 生成 C(k); * * @param cfc * @return */ private CFCon getCandateFroFreq(CFCon cfcL){ CFCon cfcC = null; if(cfcL != null){ cfcC = new CFCon(1, cfcL.iteratNum + 1); List cfList = cfcL.cfList; for(int outIndex = 0; outIndex < cfList.size(); outIndex++){ CF cfOut = cfList.get(outIndex); List itemOutList = cfOut.itemList; for(int inIndex 
= outIndex + 1; inIndex < cfList.size(); inIndex++){ if(outIndex == inIndex) continue; CF cfIn = cfList.get(inIndex); List itemInList = cfIn.itemList; List itemList = new ArrayList(); boolean same = true; for(int index = 0; index < itemOutList.size() - 1; index++){ String out = itemOutList.get(index); String in = itemInList.get(index); if(out == null || in == null || !out.equals(in)){ same = false; break; } itemList.add(out); } if(same){ String out = itemOutList.get(itemOutList.size() - 1 ); String in = itemInList.get(itemInList.size() - 1); if(out != null && in != null && !out.equals(in)){ if(out.compareTo(in) >= 0){ itemList.add(in); itemList.add(out); } else { itemList.add(out); itemList.add(in); } CF cf = new CF(itemList, 0); if(!has_infreqent_subset(itemList, cfcL)){ cfcC.cfList.add(cf); } } } } } } return cfcC; } /** * 在L(k-1)查找是否存在,cList(k-1)子集 * * @param cList * @param cfc L(k-1) * @return */ private boolean has_infreqent_subset(List cList, CFCon cfc){ HashSet set = new HashSet(); List cfList = cfc.cfList; for(int index = 0; index < cfList.size(); index++){ CF cf = cfList.get(index); List itemList = cf.itemList; String key = ""; boolean first = true; for(String item : itemList){ if(first){ first = false; key = item; } else { key += "," + item; } } set.add(key); } StringBuilder sb = new StringBuilder(); for(int index = 0; index < cList.size(); index++){ sb.delete(0, sb.length()); boolean first = true; for(int index2 = 0; index2 < cList.size(); index2++){ if(index2 == index)continue; else { if(first){ sb.append(cList.get(index2)); first = false; } else { sb.append(","); sb.append(cList.get(index2)); } } } boolean setCon = set.contains(sb.toString()); if(!setCon) return true; } return false; } private class CFCon { List cfList; int cOrf; //1.候选集,2,频繁集 int iteratNum; //迭代次数 public CFCon(int cOrf, int iteratNum){ cfList = new ArrayList(); this.cOrf = cOrf; this.iteratNum = iteratNum; } public CFCon(int n, int cOrf, int iteratNum){ this.cOrf = cOrf; 
this.iteratNum = iteratNum; cfList = new ArrayList(); for(int index = 0; index < n; index++){ List itemList = new ArrayList(); itemList.add(items.get(index)); CF cf = new CF(itemList, 0); cfList.add(cf); } } } private class CF { List itemList; int supCount; public CF(List itemList, int supCount){ this.itemList = itemList; this.supCount = supCount; } } private CFCon find_frequent_1_itemsets(){ CFCon cfc = null; if(bitVectorList != null && items != null){ cfc = new CFCon(items.size(), 1, 1); for(List bitVector : bitVectorList){ if(bitVector != null){ for(int index = 0; index < bitVector.size(); index++){ int bit = bitVector.get(index); CF cf = cfc.cfList.get(index); if(bit == 1) cf.supCount++; } } } } return cfc; } /** * 通过min_suport过滤掉最小的 * * @param cfcC * @return */ private CFCon candidateToFreqent(CFCon cfcC){ List cfList = cfcC.cfList; CFCon cfcL = new CFCon(2, cfcC.iteratNum); if(cfList != null){ for(int index = cfList.size() - 1; index >= 0; index--){ CF cf = cfList.get(index); int supCount = cf.supCount; if(supCount >= min_sport){ cfcL.cfList.add(cf); } } } return cfcL; } private void generateData(){ items = new ArrayList(); for(int index = 1; index <=5; index++) items.add("I" + index); bitVectorList.add(getStrList("1,1,0,0,1")); bitVectorList.add(getStrList("0,1,0,1,0")); bitVectorList.add(getStrList("0,1,1,0,0")); bitVectorList.add(getStrList("1,1,0,1,0")); bitVectorList.add(getStrList("1,0,1,0,0")); bitVectorList.add(getStrList("0,1,1,0,0")); bitVectorList.add(getStrList("1,0,1,0,0")); bitVectorList.add(getStrList("1,1,1,0,1")); bitVectorList.add(getStrList("1,1,1,0,0")); } private List getStrList(String bitVector){ List list = new ArrayList(); if(bitVector != null){ String[] bitArr = bitVector.split(","); for(String bit : bitArr){ list.add(Integer.parseInt(bit)); } } return list; }}