Java实现关联分析算法Apriori

package com.dataming.association;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;

public class Apriori {

 private static final Logger log = Logger.getLogger(Apriori.class);
 
 private int min_sport = 2;
 
 private List<String> items; //这里面的内容一定要按照顺序存放
 private List<List<Integer>> bitVectorList = new ArrayList<List<Integer>>();
 private List<CFCon> candidateList = new ArrayList<CFCon>();
 private List<CFCon> freqenceList = new ArrayList<CFCon>();
 
 public static void main(String args[]){
  Apriori apriori = new Apriori();
  
  apriori.generateData();
  apriori.apriMain();
  apriori.printFreqItems();
  
 }
 
 private void printFreqItems(){
  CFCon cfcL = freqenceList.get(freqenceList.size() - 1);
  for(CF cf : cfcL.cfList){
   String kk = "";
   List<String> itemList = cf.itemList;
   for(int i = 0; i < itemList.size(); i++){
    if(i == 0){
     kk = itemList.get(i);
    } else {
     kk += "," + itemList.get(i);
    }
   }
   log.info("freqence: " + kk + " supCount:" + cf.supCount);
  }
 }
 
 private void apriMain(){
  //C1
  CFCon cfcC1 = find_frequent_1_itemsets();
  
  candidateList.add(cfcC1);
  CFCon cfcL1 = candidateToFreqent(cfcC1);
  freqenceList.add(cfcL1);
  
  CFCon cfcL = cfcL1;
  
  HashSet<String> set = new HashSet<String>();
  for(int k = 2; cfcL != null && cfcL.cfList != null && cfcL.cfList.size() > 0; k++){
   CFCon cfcCk = getCandateFroFreq(cfcL);
   //为cfcC计数
   for(List<Integer> bitVector : bitVectorList){
    set.clear();
    for(int i = 0; i < items.size(); i++){
     int bit = bitVector.get(i);
     if(bit == 1){
      set.add(items.get(i));
     }
    }
    List<CF> cfList = cfcCk.cfList;
    for(CF cf : cfList){
     List<String> itemList = cf.itemList;
     boolean isAdd = true;
     for(String item : itemList){
      if(!set.contains(item)){
       isAdd = false;
       break;
      }
     }
     if(isAdd)cf.supCount++;
    }
   }
   
   cfcL = candidateToFreqent(cfcCk);
   if(cfcCk.cfList != null && cfcCk.cfList.size() > 0)candidateList.add(cfcCk);
   if(cfcL.cfList != null && cfcL.cfList.size() > 0)freqenceList.add(cfcL);
  }
 }
 
 /**
  * 从L(k-1) 生成 C(k);
  *
  * @param cfc
  * @return
  */
 private CFCon getCandateFroFreq(CFCon cfcL){
  CFCon cfcC = null;
  
  if(cfcL != null){
   cfcC = new CFCon(1, cfcL.iteratNum + 1);
   List<CF> cfList = cfcL.cfList;
   for(int outIndex = 0; outIndex < cfList.size(); outIndex++){
    CF cfOut = cfList.get(outIndex);
    List<String> itemOutList = cfOut.itemList;
    for(int inIndex = outIndex + 1; inIndex < cfList.size(); inIndex++){
     if(outIndex == inIndex) continue;
     
     CF cfIn = cfList.get(inIndex);
     List<String> itemInList = cfIn.itemList;
     
     List<String> itemList = new ArrayList<String>();
     
     boolean same = true;
     for(int index = 0; index < itemOutList.size() - 1; index++){
      String out = itemOutList.get(index);
      String in = itemInList.get(index);
      if(out == null || in == null || !out.equals(in)){
       same = false;
       break;
      }
      itemList.add(out);
     }
     if(same){
      String out = itemOutList.get(itemOutList.size() - 1 );
      String in = itemInList.get(itemInList.size() - 1);
      if(out != null && in != null && !out.equals(in)){
       if(out.compareTo(in) >= 0){
        itemList.add(in);
        itemList.add(out);
       } else {
        itemList.add(out);
        itemList.add(in);
       }
       CF cf = new CF(itemList, 0);
       if(!has_infreqent_subset(itemList, cfcL)){
        cfcC.cfList.add(cf);
       }
      }
     }
    }
   }
  }
  
  return cfcC;
 } 
 
 /**
  * 在L(k-1)查找是否存在,cList(k-1)子集
  *
  * @param cList
  * @param cfc L(k-1)
  * @return
  */
 private boolean has_infreqent_subset(List<String> cList, CFCon cfc){
  HashSet<String> set = new HashSet<String>();
  
  List<CF> cfList = cfc.cfList;
  for(int index = 0; index < cfList.size(); index++){
   CF cf = cfList.get(index);
   List<String> itemList = cf.itemList;
   String key = "";
   boolean first = true;
   for(String item : itemList){
    if(first){
     first = false;
     key = item;
    } else {
     key += "," + item;
    }
   }
   set.add(key);
  }
  
  StringBuilder sb = new StringBuilder();  
  for(int index = 0; index < cList.size(); index++){
   
   sb.delete(0, sb.length());
   boolean first = true;
   for(int index2 = 0; index2 < cList.size(); index2++){
    if(index2 == index)continue;
    else {
     if(first){
      sb.append(cList.get(index2));
      first = false;
     } else {
      sb.append(",");
      sb.append(cList.get(index2));
     }
    }
   }
   boolean setCon = set.contains(sb.toString());
   if(!setCon) return true;
  }
  
  return false;
 }
 
 private class CFCon {
  
  List<CF> cfList;
  int cOrf;  //1.候选集,2,频繁集
  int iteratNum; //迭代次数
  
  public CFCon(int cOrf, int iteratNum){
   cfList = new ArrayList<CF>();
   this.cOrf = cOrf;
   this.iteratNum = iteratNum;
  }
  
  public CFCon(int n, int cOrf, int iteratNum){
   this.cOrf = cOrf;
   this.iteratNum = iteratNum;
   cfList = new ArrayList<CF>();
   for(int index = 0; index < n; index++){
    List<String> itemList = new ArrayList<String>();
    itemList.add(items.get(index));
    
    CF cf = new CF(itemList, 0);
    
    cfList.add(cf);
   }
  }
 }
 
 private class CF {
  List<String> itemList;
  int supCount;
  
  public CF(List<String> itemList, int supCount){
   this.itemList = itemList;
   this.supCount = supCount;
  }
 }
 
 private CFCon find_frequent_1_itemsets(){
  
  CFCon cfc = null;
  if(bitVectorList != null && items != null){
   cfc = new CFCon(items.size(), 1, 1);
   for(List<Integer> bitVector : bitVectorList){
    if(bitVector != null){
     for(int index = 0; index < bitVector.size(); index++){
      int bit = bitVector.get(index);
      CF cf = cfc.cfList.get(index);
      if(bit == 1) cf.supCount++;
     }
    }
   }
  }
  return cfc;
 }
 
 /**
  * 通过min_suport过滤掉最小的
  *
  * @param cfcC
  * @return
  */
 private CFCon candidateToFreqent(CFCon cfcC){
  List<CF> cfList = cfcC.cfList;
  
  CFCon cfcL = new CFCon(2, cfcC.iteratNum);
  if(cfList != null){
   for(int index = cfList.size() - 1; index >= 0; index--){
    CF cf = cfList.get(index);
    int supCount = cf.supCount;
    if(supCount >= min_sport){
     cfcL.cfList.add(cf);
    }
   }
  }
  return cfcL;
 }
 
 private void generateData(){
  items = new ArrayList<String>();
  for(int index = 1; index <=5; index++) items.add("I" + index);
  
  bitVectorList.add(getStrList("1,1,0,0,1"));
  bitVectorList.add(getStrList("0,1,0,1,0"));
  bitVectorList.add(getStrList("0,1,1,0,0"));
  bitVectorList.add(getStrList("1,1,0,1,0"));
  bitVectorList.add(getStrList("1,0,1,0,0"));
  bitVectorList.add(getStrList("0,1,1,0,0"));
  bitVectorList.add(getStrList("1,0,1,0,0"));
  bitVectorList.add(getStrList("1,1,1,0,1"));
  bitVectorList.add(getStrList("1,1,1,0,0"));
  
 }
 
 private List<Integer> getStrList(String bitVector){
  List<Integer> list = new ArrayList<Integer>();
  if(bitVector != null){
   String[] bitArr = bitVector.split(",");
   for(String bit : bitArr){
    list.add(Integer.parseInt(bit));
   }
  }
  return list;
 }
}

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
好的,让我们来一步一步实现Apriori算法。

首先,我们需要读取数据集并将其转换为合适的数据结构。假设我们有一个包含交易数据的文件,每行表示一次交易,其中商品以逗号分隔。我们可以使用Python中的csv模块读取文件并将其转换为列表形式。代码如下:

```python
import csv

transactions = []
with open('data.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        transactions.append(row)
```

接下来,我们需要统计每个项的出现次数,并删除出现次数低于某个阈值的项。这里我们可以使用Python中的Counter和set数据结构。代码如下:

```python
from collections import Counter

# 统计每个项的出现次数
item_counts = Counter()
for transaction in transactions:
    for item in transaction:
        item_counts[item] += 1

# 删除出现次数低于阈值的项
min_support = 0.5
items = set(item for item, count in item_counts.items() if count / len(transactions) >= min_support)
```

然后,我们需要生成候选项集。假设我们要生成长度为2的候选项集,我们可以使用Python中的itertools模块的combinations函数。代码如下:

```python
import itertools

# 生成长度为2的候选项集
candidate_itemsets = set(itertools.combinations(items, 2))
```

接下来,我们需要扫描数据集,统计候选项集的出现次数,并删除出现次数低于某个阈值的候选项集。代码如下:

```python
# 统计候选项集的出现次数
itemset_counts = Counter()
for transaction in transactions:
    for itemset in candidate_itemsets:
        if set(itemset).issubset(set(transaction)):
            itemset_counts[itemset] += 1

# 删除出现次数低于阈值的候选项集
min_support = 0.5
frequent_itemsets = set(itemset for itemset, count in itemset_counts.items() if count / len(transactions) >= min_support)
```

现在我们得到了长度为2的频繁项集。我们可以使用相同的方法生成更高维度的候选项集,并重复以上步骤,直到不再有频繁项集产生为止。代码如下:

```python
# 生成更高维度的候选项集
k = 3
while True:
    # 生成长度为k的候选项集
    candidate_itemsets = set(itertools.combinations(frequent_itemsets, k))
    # 统计候选项集的出现次数
    itemset_counts = Counter()
    for transaction in transactions:
        for itemset in candidate_itemsets:
            if set(itemset).issubset(set(transaction)):
                itemset_counts[itemset] += 1
    # 删除出现次数低于阈值的候选项集
    min_support = 0.5
    frequent_itemsets = set(itemset for itemset, count in itemset_counts.items() if count / len(transactions) >= min_support)
    # 如果不再有频繁项集产生,则退出循环
    if not frequent_itemsets:
        break
    k += 1
```

最后,我们可以使用频繁项集生成关联规则,并计算它们的置信度和支持度。代码如下:

```python
# 生成关联规则
rules = []
for itemset in frequent_itemsets:
    for i in range(1, len(itemset)):
        left = itemset[:i]
        right = itemset[i:]
        support = itemset_counts[itemset] / len(transactions)
        confidence = itemset_counts[itemset] / item_counts[left]
        rules.append((left, right, support, confidence))

# 打印关联规则
for left, right, support, confidence in rules:
    print(f'{left} => {right} (support: {support}, confidence: {confidence})')
```

这就是Apriori算法的实现过程。当然,实际应用中还需要考虑很多细节,例如如何对候选项集进行剪枝、如何高效地计算频繁项集等等。如果您对此感兴趣,可以进一步学习相关知识。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值