Computing Association Rules from the Results of Mahout's FPGrowth Algorithm

Before calling the Mahout algorithm package, let's first look at how association rules are computed with the FP-Tree algorithm on a single machine.


A reference implementation of the FP-Tree algorithm can be found at: http://www.cnblogs.com/zhangchaoyang/articles/2198946.html

That post explains the theory behind the FP-Tree algorithm in detail: it builds the FP-Tree, computes the conditional pattern bases of each suffix pattern, and then iterates over them to obtain all association rules that meet the thresholds.
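As a reminder, for a rule X => Y over N transactions, support(X => Y) = count(X ∪ Y) / N and confidence(X => Y) = count(X ∪ Y) / count(X); a rule qualifies when both values exceed the chosen minimum thresholds.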

When calling the Mahout algorithm package, however, we find that the frequent-pattern output only contains the conditional pattern bases of the suffix patterns produced by the first FP-Tree construction; clearly, the full set of association rules cannot be extracted from it directly.
There is material online describing how to compute association rules from this output:

The original blog post is in English and requires a proxy to access:

Code implementation based on the material above:
The class that calls the Mahout algorithm package:
package myTesting.associate;


import java.io.IOException;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.fpm.pfpgrowth.PFPGrowth;
import org.apache.mahout.utils.SequenceFileDumper;


public class FPTreeTest {

    public static void main(String[] args) throws Exception {
        runFP();
        printFP();
    }

    /**
     * Run the (parallel) FPGrowth algorithm.
     * @throws IOException 
     * @throws ClassNotFoundException 
     * @throws InterruptedException 
     */
    public static void runFP() throws IOException, InterruptedException, ClassNotFoundException {
        // CLI equivalent: mahout fpg -i output.dat -o patterns -k 10 -method mapreduce -regex '[\ ]' -s 10
        Parameters params = new Parameters();
        params.set("minSupport", "3");
        params.set("maxHeapSize", "10");
        params.set("numGroups", "1000");
        params.set("treeCacheSize", "5");
        params.set("splitPattern", " ");

        String encoding = "UTF-8";
        params.set("encoding", encoding);

        params.set(PFPGrowth.USE_FPG2, "true");

        Path inputDir = new Path("hdfs://192.168.9.72:9000/space_milk.dat");
        Path outputDir = new Path("hdfs://192.168.9.72:9000/patterns");
        params.set("input", inputDir.toString());
        params.set("output", outputDir.toString());

        Configuration conf = new Configuration();
        HadoopUtil.delete(conf, outputDir);
        PFPGrowth.runPFPGrowth(params);
    }

    /**
     * Print the FPGrowth results.
     * @throws Exception 
     */
    public static void printFP() throws Exception {
        // CLI equivalent: mahout seqdumper -i patterns/frequentpatterns/part-r-00000
        SequenceFileDumper dumper = new SequenceFileDumper();
        String[] args = new String[] { "-i", "hdfs://192.168.9.72:9000/patterns/frequentpatterns/part-r-00000" };
        dumper.run(args);
    }
}
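PFPGrowth reads the input as plain text, one transaction per line, splitting each line into items with the configured splitPattern (a single space here); the contents of space_milk.dat itself are not shown in this post. Its output directory then contains, among other things, fList (the frequent items with their counts) and frequentpatterns (the top-k patterns per item), which are exactly the two files the next class reads.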

The class that computes the association rules:
package myTesting.associate;


import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;


public class ResultReader {
  public static Map<Integer, Long> readFrequency(Configuration configuration, String fileName) throws Exception {
       FileSystem fs = FileSystem.get(configuration);
       Reader frequencyReader = new SequenceFile.Reader(fs,new Path(fileName), configuration);
       Map<Integer, Long> frequency = new HashMap<Integer, Long>();
       Text key = new Text();
       LongWritable value = new LongWritable();
       
       while(frequencyReader.next(key, value)) {
           frequency.put(Integer.parseInt(key.toString()), value.get());
       }
       frequencyReader.close();
       return frequency;
       
   }
 
 
   public static void readFrequentPatterns(
           Configuration configuration,
           String fileName,
           int transactionCount,
           Map<Integer, Long> frequency,
           double minSupport, double minConfidence) throws Exception {
       FileSystem fs = FileSystem.get(configuration);
 
       // Read the FPGrowth output: one record per suffix item
       Reader frequentPatternsReader = new SequenceFile.Reader(fs,
               new Path(fileName), configuration);
       Text key = new Text();
       TopKStringPatterns value = new TopKStringPatterns();
 
       while(frequentPatternsReader.next(key, value)) {
           long firstFrequencyItem = -1;
           String firstItemId = null; // the suffix item (the rule consequent)
           List<Pair<List<String>, Long>> patterns = value.getPatterns(); // conditional pattern bases
           int i = 0;
           for(Pair<List<String>, Long> pair: patterns) {
               // iterate over every pattern base of this suffix item
               List<String> itemList = pair.getFirst(); // the pattern base (itemset)
               Long occurrence = pair.getSecond(); // occurrence count of this pattern base
               if (i == 0) {
                   firstFrequencyItem = occurrence;
                   firstItemId = itemList.get(0);
               } else {
                   double support = (double)occurrence / transactionCount;
                   double confidence = (double)occurrence / firstFrequencyItem;
                   
                   if ((support > minSupport && confidence > minConfidence)) {
                       List<String> listWithoutFirstItem = new ArrayList<String>();
                       for(String itemId: itemList) {
                           if (!itemId.equals(firstItemId)) {
                               
                               listWithoutFirstItem.add(itemId);
                           }
                       }
                       String firstItem = firstItemId;
                       listWithoutFirstItem.remove(firstItemId);
                       System.out.printf(
                           "%s => %s: supp=%.3f, conf=%.3f",
                           listWithoutFirstItem,
                           firstItem,
                           support,
                           confidence);
 
                       if (itemList.size() == 2) {
                           // we can easily compute the lift and the conviction for set of
                           // size 2, so do it
                           int otherItemId = -1;
                           for(String itemId: itemList) {
                               if (!itemId.equals(firstItemId)) {
                                   otherItemId = Integer.parseInt(itemId);
                                   break;
                               }
                           }
                           long otherItemOccurrence = frequency.get(otherItemId);
                           double lift = (double)occurrence / (firstFrequencyItem * otherItemOccurrence);
                           double conviction = (1.0 - (double)otherItemOccurrence / transactionCount) / (1.0 - confidence);
                           System.out.printf(
                               ", lift=%.3f, conviction=%.3f",
                               lift, conviction);
                       }
                       System.out.printf("\n");
                   }
               }
               i++;
           }
       }
       frequentPatternsReader.close();
 
   }
 
   public static void main(String args[]) throws Exception {
 
       int transactionCount = 9; // total number of transactions in the input
       String frequencyFilename = "hdfs://192.168.9.72:9000/patterns/fList";
       
       String frequentPatternsFilename = "hdfs://192.168.9.72:9000/patterns/frequentpatterns/part-r-00000";
       
       double minSupport = 0.001; // minimum support
       double minConfidence = 0.3; // minimum confidence
 
 
       Configuration configuration = new Configuration();
       // Read the frequent 1-itemsets: key is the item, value is its occurrence count
       Map<Integer, Long> frequency = readFrequency(configuration, frequencyFilename);
       
       // Compute the association rules
       readFrequentPatterns(configuration, frequentPatternsFilename,
               transactionCount, frequency, minSupport, minConfidence);
 
   }
}
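One small gap in the class above: transactionCount is hard-coded to 9. If you prefer to derive it, a helper along the following lines could count the non-empty lines of the original transaction file on HDFS. This is only a sketch, assuming one transaction per line; it is not part of the original code:

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class TransactionCounter {
    /** Count transactions by counting the non-empty lines of the input file on HDFS. */
    public static int countTransactions(Configuration conf, String inputFile) throws Exception {
        FileSystem fs = FileSystem.get(new Path(inputFile).toUri(), conf);
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path(inputFile)), "UTF-8"));
        int count = 0;
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    count++;
                }
            }
        } finally {
            reader.close();
        }
        return count;
    }
}

It could then be called as countTransactions(configuration, "hdfs://192.168.9.72:9000/space_milk.dat") from main() instead of hard-coding the value.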

Test results. The raw output of the Mahout job (each key is an item; the value lists the top frequent patterns containing that item, with their support counts):
Key: 1: Value: ([1],6), ([4, 1],5), ([3, 1],5), ([3, 4, 1],4), ([2, 4, 1],4), ([2, 3, 4, 1],3)
Key: 2: Value: ([2],7), ([2, 4],6), ([2, 3],5), ([2, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3), ([2, 5],3)
Key: 3: Value: ([3],7), ([3, 4],5), ([3, 1],5), ([2, 3],5), ([3, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3), ([3, 5],3)
Key: 4: Value: ([4],7), ([2, 4],6), ([4, 1],5), ([3, 4],5), ([3, 4, 1],4), ([2, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3)
Key: 5: Value: ([5],4), ([3, 5],3), ([2, 5],3)

All association rules computed from it:
[4] => 1: supp=0.556, conf=0.833, lift=0.119, conviction=1.333
[3] => 1: supp=0.556, conf=0.833, lift=0.119, conviction=1.333
[3, 4] => 1: supp=0.444, conf=0.667
[2, 4] => 1: supp=0.444, conf=0.667
[2, 3, 4] => 1: supp=0.333, conf=0.500
[4] => 2: supp=0.667, conf=0.857, lift=0.122, conviction=1.556
[3] => 2: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[4, 1] => 2: supp=0.444, conf=0.571
[3, 4] => 2: supp=0.444, conf=0.571
[3, 4, 1] => 2: supp=0.333, conf=0.429
[5] => 2: supp=0.333, conf=0.429, lift=0.107, conviction=0.972
[4] => 3: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[1] => 3: supp=0.556, conf=0.714, lift=0.119, conviction=1.167
[2] => 3: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[4, 1] => 3: supp=0.444, conf=0.571
[2, 4] => 3: supp=0.444, conf=0.571
[2, 4, 1] => 3: supp=0.333, conf=0.429
[5] => 3: supp=0.333, conf=0.429, lift=0.107, conviction=0.972
[2] => 4: supp=0.667, conf=0.857, lift=0.122, conviction=1.556
[1] => 4: supp=0.556, conf=0.714, lift=0.119, conviction=1.167
[3] => 4: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[3, 1] => 4: supp=0.444, conf=0.571
[2, 1] => 4: supp=0.444, conf=0.571
[2, 3] => 4: supp=0.444, conf=0.571
[2, 3, 1] => 4: supp=0.333, conf=0.429
[3] => 5: supp=0.333, conf=0.750, lift=0.107, conviction=0.889
[2] => 5: supp=0.333, conf=0.750, lift=0.107, conviction=0.889


Analyzing the output, ([2, 4, 1],4) is a frequent pattern that meets the thresholds, so every 2-item subset of it must also be frequent; the rules 2 => 1 and 1 => 2 should therefore qualify, yet the rule-extraction code above clearly cannot produce them. Since I only need association rules between pairs of items, I wrote a simple MapReduce job to compute them (a sketch of the custom TextArrayPair Writable it relies on follows the listing):
package myTesting.associate;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.fpm.pfpgrowth.PFPGrowth;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;

// Note: TextArrayPair, HdfsUtils and Constants are custom helper classes,
// assumed here to live in the same package; they are not part of Hadoop or Mahout.
public class FPTreeAssoRuleMakerJob {

    private static final String MY_FREQUENT = "myfrequent";

    public static void makeSimpleAssoRule(Parameters params) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        conf.set("mapred.compress.map.output", "true");
        conf.set("mapred.output.compression.type", "BLOCK");

        String outputPath = params.get("output");

        Path input = new Path(outputPath, PFPGrowth.FP_GROWTH);
        Job job = new Job(conf, "AssoRule Driver running over input: " + input);
        job.setJarByClass(FPTreeAssoRuleMakerJob.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TextArrayPair.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);

        FileInputFormat.addInputPath(job, input);
        Path outPath = new Path(outputPath, MY_FREQUENT);
        FileOutputFormat.setOutputPath(job, outPath);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(AssoRuleMap.class);
//        job.setCombinerClass(Reduce.class);
        job.setReducerClass(AssoRuleReduce.class);
//        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        HadoopUtil.delete(conf, outPath);
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

        // write a completion marker file
        FileSystem fs = FileSystem.get(URI.create(outputPath), conf);
        HdfsUtils.createOverFile(fs, outputPath + Path.SEPARATOR + Constants.mark_UserGroupsAnalysis, null);
    }
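    /*
     * Hypothetical driver code (not part of the original post): the method only
     * reads the "output" parameter, so it can be pointed at the patterns
     * directory produced by the PFPGrowth run above:
     *
     *   Parameters params = new Parameters();
     *   params.set("output", "hdfs://192.168.9.72:9000/patterns");
     *   FPTreeAssoRuleMakerJob.makeSimpleAssoRule(params);
     */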


    public static class AssoRuleMap extends Mapper<Text, TopKStringPatterns, Text, TextArrayPair> {

        /**
         * From the FPGrowth output, extract all candidate frequent 1-itemsets and
         * 2-itemsets. Each 2-itemset (other item, suffix item) is emitted once per
         * member item, using that item as the key and the itemset plus its count
         * (wrapped in the custom TextArrayPair type) as the value, so the reducer
         * can group them.
         */
        protected void map(Text key, TopKStringPatterns values, Context context) throws IOException, InterruptedException {
            System.out.println("key:" + key);
            System.out.println("values:" + values);
            List<Pair<List<String>, Long>> patterns = values.getPatterns();
            TextArrayPair tap;
            ArrayList<String> array;
            for (Pair<List<String>, Long> pattern : patterns) {
                List<String> conditions = pattern.getFirst();
                Long value = pattern.getSecond();

                if (conditions.size() == 1) { // 1-itemset
                    tap = new TextArrayPair();
                    tap.setFields(conditions);
                    tap.setCount(value);
                    context.write(key, tap);
                } else {
                    // the last element is the suffix item
                    if (conditions.get(conditions.size() - 1).equals(key.toString())) {
                        String first;
                        String second;
                        for (int i = 0; i < conditions.size() - 1; i++) {
                            array = new ArrayList<String>();
                            first = conditions.get(i).trim();
                            second = key.toString().trim();
                            array.add(first);
                            array.add(second);

                            tap = new TextArrayPair();
                            tap.setCount(value);
                            tap.setFields(array);

                            context.write(key, tap);             // (suffix item, itemset) => reducer
                            context.write(new Text(first), tap); // (other item, itemset)  => reducer
                        }
                    }
                }
            }
        }
    }
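    /*
     * Example, taken from the FPGrowth output above. For the record
     *   Key: 1, Value: ([1],6), ([4, 1],5), ..., ([2, 4, 1],4), ...
     * the mapper emits, among others:
     *   ("1", {1}/6)                        from the 1-itemset ([1],6)
     *   ("1", {4,1}/5) and ("4", {4,1}/5)   from ([4, 1],5)
     *   ("1", {2,1}/4), ("2", {2,1}/4),
     *   ("1", {4,1}/4), ("4", {4,1}/4)      from ([2, 4, 1],4)
     * so under key "2" the reducer sees the pair {2,1} with count 4 together with
     * the 1-itemset {2} with count 7, which yields the rule 2=>1 with conf 4/7.
     */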

    public static class AssoRuleReduce extends Reducer<Text, TextArrayPair, Text, DoubleWritable> {

        /**
         * The key's own occurrence count (from its 1-itemset) is the denominator;
         * each 2-itemset count is a numerator. When the same 2-itemset arrives more
         * than once for a key, keep the larger count.
         */
        protected void reduce(Text key, Iterable<TextArrayPair> values, Context context) throws IOException, InterruptedException {
            System.out.println("key:" + key);

            // collect the values from the iterator into a HashMap
            Map<List<String>, Long> itemsMap = new HashMap<List<String>, Long>();
            Long denominator = 1L;

            for (TextArrayPair value : values) {

                for (String valStr : value.getFields()) {
                    System.out.print(valStr + " ");
                }
                System.out.println("," + value.getCount());

                if (value.getFields().size() == 1) { // 1-itemset: its count is the occurrence count of the suffix (key) item
                    denominator = value.getCount();
                } else { // 2-itemset
                    if (itemsMap.containsKey(value.getFields())) { // this itemset has been seen before
                        if (value.getCount() > itemsMap.get(value.getFields())) { // keep the larger count
                            System.out.println("duplicate itemset, keeping the larger count...");
                            itemsMap.put(value.getFields(), value.getCount());
                        }
                    } else { // first time this itemset is seen
                        itemsMap.put(value.getFields(), value.getCount());
                    }
                }
            }

            if (denominator == null || denominator < 1) {
                System.out.println("============ bad data: denominator set to Long.MAX_VALUE, conf will be ~0 ============");
                denominator = Long.MAX_VALUE;
            }

            String first;
            String second;
            Double conf;
            // compute the confidence of each rule
            Set<List<String>> keyset = itemsMap.keySet();
            Iterator<List<String>> it = keyset.iterator();
            System.out.println("=========== qualifying rules ===========");
            while (it.hasNext()) {
                List<String> fields = it.next();

                if (fields.size() == 2) {
                    first = fields.get(0);
                    second = fields.get(1);
                    conf = itemsMap.get(fields) * 1.0 / denominator;

                    if (first.equals(key.toString())) {
                        context.write(new Text(first + "=>" + second), new DoubleWritable(conf));
                        System.out.println(first + "=>" + second + "," + conf);
                    } else {
                        context.write(new Text(second + "=>" + first), new DoubleWritable(conf));
                        System.out.println(second + "=>" + first + "," + conf);
                    }
                }
            }
        }
    }



}
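The job above depends on three helper classes that are not shown in this post: HdfsUtils and Constants (only used to write the completion marker file) and the custom Writable TextArrayPair, which carries an itemset together with its count from the mapper to the reducer. Below is a minimal sketch of what TextArrayPair could look like, reconstructed only from the methods the job actually calls (setFields/getFields, setCount/getCount); it is an assumption, not the original class:

package myTesting.associate;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

// Sketch of the custom value type: a list of item strings plus an occurrence count.
public class TextArrayPair implements Writable {

    private List<String> fields = new ArrayList<String>();
    private long count = 0L;

    public void setFields(List<String> fields) { this.fields = fields; }

    public List<String> getFields() { return fields; }

    public void setCount(Long count) { this.count = count; }

    public Long getCount() { return count; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(count);
        out.writeInt(fields.size());
        for (String field : fields) {
            Text.writeString(out, field);
        }
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        count = in.readLong();
        int size = in.readInt();
        // Allocate a fresh list so the lists handed out by getFields() stay valid
        // when Hadoop reuses this Writable instance across reduce() iterations.
        fields = new ArrayList<String>(size);
        for (int i = 0; i < size; i++) {
            fields.add(Text.readString(in));
        }
    }
}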

If you know a simpler way to implement this, please share it!

