Computing Association Rules from the Results of Mahout's FPGrowth Algorithm

Before calling the Mahout algorithm package, let's first look at how association rules are computed with the FP-Tree algorithm on a single machine.


A reference implementation of the FP-Tree algorithm can be found at: http://www.cnblogs.com/zhangchaoyang/articles/2198946.html

That post explains the theory behind the FP-Tree algorithm in detail: it builds the FP-Tree, computes the conditional pattern bases of each suffix pattern, and then iterates over them to obtain all association rules that meet the thresholds.
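As a reminder, for a rule X => Y over N transactions, support(X => Y) = count(X ∪ Y) / N and confidence(X => Y) = count(X ∪ Y) / count(X); a rule qualifies when both values exceed the chosen minimum thresholds.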

When calling the Mahout algorithm package, however, we find that the frequent-pattern output only contains the conditional pattern bases of the suffix patterns produced by the first FP-Tree construction; clearly, the full set of association rules cannot be extracted from it directly.
There is material online describing how to compute association rules from this output:

The original blog post is in English and requires a proxy to access:

Code implementation based on the material above:
The class that calls the Mahout algorithm package:
package myTesting.associate;


import java.io.IOException;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.fpm.pfpgrowth.PFPGrowth;
import org.apache.mahout.utils.SequenceFileDumper;


public class FPTreeTest {

    public static void main(String[] args) throws Exception {
        runFP();
        printFP();
    }

    /**
     * Run the (parallel) FPGrowth algorithm.
     * @throws IOException 
     * @throws ClassNotFoundException 
     * @throws InterruptedException 
     */
    public static void runFP() throws IOException, InterruptedException, ClassNotFoundException {
        // CLI equivalent: mahout fpg -i output.dat -o patterns -k 10 -method mapreduce -regex '[\ ]' -s 10
        Parameters params = new Parameters();
        params.set("minSupport", "3");
        params.set("maxHeapSize", "10");
        params.set("numGroups", "1000");
        params.set("treeCacheSize", "5");
        params.set("splitPattern", " ");

        String encoding = "UTF-8";
        params.set("encoding", encoding);

        params.set(PFPGrowth.USE_FPG2, "true");

        Path inputDir = new Path("hdfs://192.168.9.72:9000/space_milk.dat");
        Path outputDir = new Path("hdfs://192.168.9.72:9000/patterns");
        params.set("input", inputDir.toString());
        params.set("output", outputDir.toString());

        Configuration conf = new Configuration();
        HadoopUtil.delete(conf, outputDir);
        PFPGrowth.runPFPGrowth(params);
    }

    /**
     * Print the FPGrowth results.
     * @throws Exception 
     */
    public static void printFP() throws Exception {
        // CLI equivalent: mahout seqdumper -i patterns/frequentpatterns/part-r-00000
        SequenceFileDumper dumper = new SequenceFileDumper();
        String[] args = new String[] { "-i", "hdfs://192.168.9.72:9000/patterns/frequentpatterns/part-r-00000" };
        dumper.run(args);
    }
}
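PFPGrowth reads the input as plain text, one transaction per line, splitting each line into items with the configured splitPattern (a single space here); the contents of space_milk.dat itself are not shown in this post. Its output directory then contains, among other things, fList (the frequent items with their counts) and frequentpatterns (the top-k patterns per item), which are exactly the two files the next class reads.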

The class that computes the association rules:
package myTesting.associate;


import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;


public class ResultReader {
  public static Map<Integer, Long> readFrequency(Configuration configuration, String fileName) throws Exception {
       FileSystem fs = FileSystem.get(configuration);
       Reader frequencyReader = new SequenceFile.Reader(fs,new Path(fileName), configuration);
       Map<Integer, Long> frequency = new HashMap<Integer, Long>();
       Text key = new Text();
       LongWritable value = new LongWritable();
       
       while(frequencyReader.next(key, value)) {
           frequency.put(Integer.parseInt(key.toString()), value.get());
       }
       frequencyReader.close();
       return frequency;
       
   }
 
 
   public static void readFrequentPatterns(
           Configuration configuration,
           String fileName,
           int transactionCount,
           Map<Integer, Long> frequency,
           double minSupport, double minConfidence) throws Exception {
       FileSystem fs = FileSystem.get(configuration);
 
       // Read the FPGrowth output: one record per suffix item
       Reader frequentPatternsReader = new SequenceFile.Reader(fs,
               new Path(fileName), configuration);
       Text key = new Text();
       TopKStringPatterns value = new TopKStringPatterns();
 
       while(frequentPatternsReader.next(key, value)) {
           long firstFrequencyItem = -1;
           String firstItemId = null; // the suffix item (the rule consequent)
           List<Pair<List<String>, Long>> patterns = value.getPatterns(); // conditional pattern bases
           int i = 0;
           for(Pair<List<String>, Long> pair: patterns) {
               // iterate over every pattern base of this suffix item
               List<String> itemList = pair.getFirst(); // the pattern base (itemset)
               Long occurrence = pair.getSecond(); // occurrence count of this pattern base
               if (i == 0) {
                   firstFrequencyItem = occurrence;
                   firstItemId = itemList.get(0);
               } else {
                   double support = (double)occurrence / transactionCount;
                   double confidence = (double)occurrence / firstFrequencyItem;
                   
                   if ((support > minSupport && confidence > minConfidence)) {
                       List<String> listWithoutFirstItem = new ArrayList<String>();
                       for(String itemId: itemList) {
                           if (!itemId.equals(firstItemId)) {
                               
                               listWithoutFirstItem.add(itemId);
                           }
                       }
                       String firstItem = firstItemId;
                       listWithoutFirstItem.remove(firstItemId);
                       System.out.printf(
                           "%s => %s: supp=%.3f, conf=%.3f",
                           listWithoutFirstItem,
                           firstItem,
                           support,
                           confidence);
 
                       if (itemList.size() == 2) {
                           // we can easily compute the lift and the conviction for set of
                           // size 2, so do it
                           int otherItemId = -1;
                           for(String itemId: itemList) {
                               if (!itemId.equals(firstItemId)) {
                                   otherItemId = Integer.parseInt(itemId);
                                   break;
                               }
                           }
                           long otherItemOccurrence = frequency.get(otherItemId);
                           double lift = (double)occurrence / (firstFrequencyItem * otherItemOccurrence);
                           double conviction = (1.0 - (double)otherItemOccurrence / transactionCount) / (1.0 - confidence);
                           System.out.printf(
                               ", lift=%.3f, conviction=%.3f",
                               lift, conviction);
                       }
                       System.out.printf("\n");
                   }
               }
               i++;
           }
       }
       frequentPatternsReader.close();
 
   }
 
   public static void main(String args[]) throws Exception {
 
       int transactionCount = 9; // total number of transactions in the input
       String frequencyFilename = "hdfs://192.168.9.72:9000/patterns/fList";
       
       String frequentPatternsFilename = "hdfs://192.168.9.72:9000/patterns/frequentpatterns/part-r-00000";
       
       double minSupport = 0.001; // minimum support
       double minConfidence = 0.3; // minimum confidence
 
 
       Configuration configuration = new Configuration();
       // Read the frequent 1-itemsets: key is the item, value is its occurrence count
       Map<Integer, Long> frequency = readFrequency(configuration, frequencyFilename);
       
       // Compute the association rules
       readFrequentPatterns(configuration, frequentPatternsFilename,
               transactionCount, frequency, minSupport, minConfidence);
 
   }
}
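One small gap in the class above: transactionCount is hard-coded to 9. If you prefer to derive it, a helper along the following lines could count the non-empty lines of the original transaction file on HDFS. This is only a sketch, assuming one transaction per line; it is not part of the original code:

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class TransactionCounter {
    /** Count transactions by counting the non-empty lines of the input file on HDFS. */
    public static int countTransactions(Configuration conf, String inputFile) throws Exception {
        FileSystem fs = FileSystem.get(new Path(inputFile).toUri(), conf);
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path(inputFile)), "UTF-8"));
        int count = 0;
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    count++;
                }
            }
        } finally {
            reader.close();
        }
        return count;
    }
}

It could then be called as countTransactions(configuration, "hdfs://192.168.9.72:9000/space_milk.dat") from main() instead of hard-coding the value.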

Test results. The raw output of the Mahout job (each key is an item; the value lists the top frequent patterns containing that item, with their support counts):
Key: 1: Value: ([1],6), ([4, 1],5), ([3, 1],5), ([3, 4, 1],4), ([2, 4, 1],4), ([2, 3, 4, 1],3)
Key: 2: Value: ([2],7), ([2, 4],6), ([2, 3],5), ([2, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3), ([2, 5],3)
Key: 3: Value: ([3],7), ([3, 4],5), ([3, 1],5), ([2, 3],5), ([3, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3), ([3, 5],3)
Key: 4: Value: ([4],7), ([2, 4],6), ([4, 1],5), ([3, 4],5), ([3, 4, 1],4), ([2, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3)
Key: 5: Value: ([5],4), ([3, 5],3), ([2, 5],3)

All association rules computed from it:
[4] => 1: supp=0.556, conf=0.833, lift=0.119, conviction=1.333
[3] => 1: supp=0.556, conf=0.833, lift=0.119, conviction=1.333
[3, 4] => 1: supp=0.444, conf=0.667
[2, 4] => 1: supp=0.444, conf=0.667
[2, 3, 4] => 1: supp=0.333, conf=0.500
[4] => 2: supp=0.667, conf=0.857, lift=0.122, conviction=1.556
[3] => 2: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[4, 1] => 2: supp=0.444, conf=0.571
[3, 4] => 2: supp=0.444, conf=0.571
[3, 4, 1] => 2: supp=0.333, conf=0.429
[5] => 2: supp=0.333, conf=0.429, lift=0.107, conviction=0.972
[4] => 3: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[1] => 3: supp=0.556, conf=0.714, lift=0.119, conviction=1.167
[2] => 3: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[4, 1] => 3: supp=0.444, conf=0.571
[2, 4] => 3: supp=0.444, conf=0.571
[2, 4, 1] => 3: supp=0.333, conf=0.429
[5] => 3: supp=0.333, conf=0.429, lift=0.107, conviction=0.972
[2] => 4: supp=0.667, conf=0.857, lift=0.122, conviction=1.556
[1] => 4: supp=0.556, conf=0.714, lift=0.119, conviction=1.167
[3] => 4: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[3, 1] => 4: supp=0.444, conf=0.571
[2, 1] => 4: supp=0.444, conf=0.571
[2, 3] => 4: supp=0.444, conf=0.571
[2, 3, 1] => 4: supp=0.333, conf=0.429
[3] => 5: supp=0.333, conf=0.750, lift=0.107, conviction=0.889
[2] => 5: supp=0.333, conf=0.750, lift=0.107, conviction=0.889


Analyzing the output, ([2, 4, 1],4) is a frequent pattern that meets the thresholds, so every 2-item subset of it must also be frequent; the rules 2 => 1 and 1 => 2 should therefore qualify, yet the rule-extraction code above clearly cannot produce them. Since I only need association rules between pairs of items, I wrote a simple MapReduce job to compute them (a sketch of the custom TextArrayPair Writable it relies on follows the listing):
package myTesting.associate;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.fpm.pfpgrowth.PFPGrowth;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;

// Note: TextArrayPair, HdfsUtils and Constants are custom helper classes,
// assumed here to live in the same package; they are not part of Hadoop or Mahout.
public class FPTreeAssoRuleMakerJob {

    private static final String MY_FREQUENT = "myfrequent";

    public static void makeSimpleAssoRule(Parameters params) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("mapred.compress.map.output", "true");
        conf.set("mapred.output.compression.type", "BLOCK");

        String outputPath = params.get("output");

        Path input = new Path(outputPath, PFPGrowth.FP_GROWTH);
        Job job = new Job(conf, "AssoRule Driver running over input: " + input);
        job.setJarByClass(FPTreeAssoRuleMakerJob.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TextArrayPair.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);

        FileInputFormat.addInputPath(job, input);
        Path outPath = new Path(outputPath, MY_FREQUENT);
        FileOutputFormat.setOutputPath(job, outPath);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(AssoRuleMap.class);
//        job.setCombinerClass(Reduce.class);
        job.setReducerClass(AssoRuleReduce.class);
//        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        HadoopUtil.delete(conf, outPath);
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

        // write a completion marker file
        FileSystem fs = FileSystem.get(URI.create(outputPath), conf);
        HdfsUtils.createOverFile(fs, outputPath + Path.SEPARATOR + Constants.mark_UserGroupsAnalysis, null);
    }
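    /*
     * Hypothetical driver code (not part of the original post): the method only
     * reads the "output" parameter, so it can be pointed at the patterns
     * directory produced by the PFPGrowth run above:
     *
     *   Parameters params = new Parameters();
     *   params.set("output", "hdfs://192.168.9.72:9000/patterns");
     *   FPTreeAssoRuleMakerJob.makeSimpleAssoRule(params);
     */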


    public static class AssoRuleMap extends Mapper<Text, TopKStringPatterns, Text, TextArrayPair> {

        /**
         * From the FPGrowth output, extract all candidate frequent 1-itemsets and
         * 2-itemsets. Each 2-itemset (other item, suffix item) is emitted once per
         * member item, using that item as the key and the itemset plus its count
         * (wrapped in the custom TextArrayPair type) as the value, so the reducer
         * can group them.
         */
        protected void map(Text key, TopKStringPatterns values, Context context) throws IOException, InterruptedException {
            System.out.println("key:" + key);
            System.out.println("values:" + values);
            List<Pair<List<String>, Long>> patterns = values.getPatterns();
            TextArrayPair tap;
            ArrayList<String> array;
            for (Pair<List<String>, Long> pattern : patterns) {
                List<String> conditions = pattern.getFirst();
                Long value = pattern.getSecond();

                if (conditions.size() == 1) { // 1-itemset
                    tap = new TextArrayPair();
                    tap.setFields(conditions);
                    tap.setCount(value);
                    context.write(key, tap);
                } else {
                    // the last element is the suffix item
                    if (conditions.get(conditions.size() - 1).equals(key.toString())) {
                        String first;
                        String second;
                        for (int i = 0; i < conditions.size() - 1; i++) {
                            array = new ArrayList<String>();
                            first = conditions.get(i).trim();
                            second = key.toString().trim();
                            array.add(first);
                            array.add(second);

                            tap = new TextArrayPair();
                            tap.setCount(value);
                            tap.setFields(array);

                            context.write(key, tap);             // (suffix item, itemset) => reducer
                            context.write(new Text(first), tap); // (other item, itemset)  => reducer
                        }
                    }
                }
            }
        }
    }
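    /*
     * Example, taken from the FPGrowth output above. For the record
     *   Key: 1, Value: ([1],6), ([4, 1],5), ..., ([2, 4, 1],4), ...
     * the mapper emits, among others:
     *   ("1", {1}/6)                        from the 1-itemset ([1],6)
     *   ("1", {4,1}/5) and ("4", {4,1}/5)   from ([4, 1],5)
     *   ("1", {2,1}/4), ("2", {2,1}/4),
     *   ("1", {4,1}/4), ("4", {4,1}/4)      from ([2, 4, 1],4)
     * so under key "2" the reducer sees the pair {2,1} with count 4 together with
     * the 1-itemset {2} with count 7, which yields the rule 2=>1 with conf 4/7.
     */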

    public static class AssoRuleReduce extends Reducer<Text, TextArrayPair, Text, DoubleWritable> {

        /**
         * The key's own occurrence count (from its 1-itemset) is the denominator;
         * each 2-itemset count is a numerator. When the same 2-itemset arrives more
         * than once for a key, keep the larger count.
         */
        protected void reduce(Text key, Iterable<TextArrayPair> values, Context context) throws IOException, InterruptedException {
            System.out.println("key:" + key);

            // collect the values from the iterator into a HashMap
            Map<List<String>, Long> itemsMap = new HashMap<List<String>, Long>();
            Long denominator = 1L;

            for (TextArrayPair value : values) {

                for (String valStr : value.getFields()) {
                    System.out.print(valStr + " ");
                }
                System.out.println("," + value.getCount());

                if (value.getFields().size() == 1) { // 1-itemset: its count is the occurrence count of the suffix (key) item
                    denominator = value.getCount();
                } else { // 2-itemset
                    if (itemsMap.containsKey(value.getFields())) { // this itemset has been seen before
                        if (value.getCount() > itemsMap.get(value.getFields())) { // keep the larger count
                            System.out.println("duplicate itemset, keeping the larger count...");
                            itemsMap.put(value.getFields(), value.getCount());
                        }
                    } else { // first time this itemset is seen
                        itemsMap.put(value.getFields(), value.getCount());
                    }
                }
            }

            if (denominator == null || denominator < 1) {
                System.out.println("============ bad data: denominator set to Long.MAX_VALUE, conf will be ~0 ============");
                denominator = Long.MAX_VALUE;
            }

            String first;
            String second;
            Double conf;
            // compute the confidence of each rule
            Set<List<String>> keyset = itemsMap.keySet();
            Iterator<List<String>> it = keyset.iterator();
            System.out.println("=========== qualifying rules ===========");
            while (it.hasNext()) {
                List<String> fields = it.next();

                if (fields.size() == 2) {
                    first = fields.get(0);
                    second = fields.get(1);
                    conf = itemsMap.get(fields) * 1.0 / denominator;

                    if (first.equals(key.toString())) {
                        context.write(new Text(first + "=>" + second), new DoubleWritable(conf));
                        System.out.println(first + "=>" + second + "," + conf);
                    } else {
                        context.write(new Text(second + "=>" + first), new DoubleWritable(conf));
                        System.out.println(second + "=>" + first + "," + conf);
                    }
                }
            }
        }
    }



}
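The job above depends on three helper classes that are not shown in this post: HdfsUtils and Constants (only used to write the completion marker file) and the custom Writable TextArrayPair, which carries an itemset together with its count from the mapper to the reducer. Below is a minimal sketch of what TextArrayPair could look like, reconstructed only from the methods the job actually calls (setFields/getFields, setCount/getCount); it is an assumption, not the original class:

package myTesting.associate;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

// Sketch of the custom value type: a list of item strings plus an occurrence count.
public class TextArrayPair implements Writable {

    private List<String> fields = new ArrayList<String>();
    private long count = 0L;

    public void setFields(List<String> fields) { this.fields = fields; }

    public List<String> getFields() { return fields; }

    public void setCount(Long count) { this.count = count; }

    public Long getCount() { return count; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(count);
        out.writeInt(fields.size());
        for (String field : fields) {
            Text.writeString(out, field);
        }
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        count = in.readLong();
        int size = in.readInt();
        // Allocate a fresh list so the lists handed out by getFields() stay valid
        // when Hadoop reuses this Writable instance across reduce() iterations.
        fields = new ArrayList<String>(size);
        for (int i = 0; i < size; i++) {
            fields.add(Text.readString(in));
        }
    }
}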

If you know a simpler way to implement this, please share it!

