在调用 Mahout 算法包之前,我们先看下单机 使用FP-Tree算法计算关联规则;
FP-Tree算法的实现 可参考网址: http://www.cnblogs.com/zhangchaoyang/articles/2198946.html
这篇blog 详细的讲述了 FP-Tree 算法的理论实现,主要通过构造 FP-Tree树,计算后缀模式的条件模式基;通过迭代计算获取所有满足条件的关联规则项。
而调用 Mahout 算法包,我们发现频繁项结果只是第一次构建FP-Tree树产生的 后缀模式的条件模式基;显然,没法直接提取所有关联规则项。
网站资料中有计算关联规则的方法(原 blog 为英文版,访问可能受限;原文链接在此处缺失,待补)。根据上面资料,代码实现如下:
调用Mahout算法包类
package myTesting.associate;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.fpm.pfpgrowth.PFPGrowth;
import org.apache.mahout.utils.SequenceFileDumper;
public class FPTreeTest {

    public static void main(String[] args) throws Exception {
        runFP();
        printFP();
    }

    /**
     * Runs the Mahout parallel FP-Growth job over the HDFS input file.
     * Equivalent CLI: mahout fpg -i output.dat -o patterns -k 10 -method mapreduce -regex '[\ ]' -s 10
     *
     * @throws IOException            on HDFS access failure
     * @throws InterruptedException   if the MapReduce job is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void runFP() throws IOException, InterruptedException, ClassNotFoundException {
        Path inputDir = new Path("hdfs://192.168.9.72:9000/space_milk.dat");
        Path outputDir = new Path("hdfs://192.168.9.72:9000/patterns");

        Parameters params = new Parameters();
        params.set("minSupport", "3");
        params.set("maxHeapSize", "10");
        params.set("numGroups", "1000");
        params.set("treeCacheSize", "5");
        params.set("splitPattern", " ");
        params.set("encoding", "UTF-8");
        params.set(PFPGrowth.USE_FPG2, "true");
        params.set("input", inputDir.toString());
        params.set("output", outputDir.toString());

        // Remove stale output so the MapReduce job does not fail on an existing path.
        Configuration conf = new Configuration();
        HadoopUtil.delete(conf, outputDir);

        PFPGrowth.runPFPGrowth(params);
    }

    /**
     * Dumps the frequent-pattern sequence file produced by the FP-Growth job.
     * Equivalent CLI: mahout seqdumper -i patterns/frequentpatterns/part-r-00000
     *
     * @throws Exception if the dump fails
     */
    public static void printFP() throws Exception {
        String[] dumperArgs = {"-i", "hdfs://192.168.9.72:9000/patterns/frequentpatterns/part-r-00000"};
        new SequenceFileDumper().run(dumperArgs);
    }
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.fpm.pfpgrowth.PFPGrowth;
import org.apache.mahout.utils.SequenceFileDumper;
public class FPTreeTest {
// Entry point: run the PFP-Growth job, then dump its output.
public static void main(String[] args) throws Exception {
runFP();
printFP();
}
/**
* Runs the Mahout parallel FP-Growth (FP-Tree) job over the HDFS input file.
* The equivalent CLI invocation is shown in the comment below.
* @throws IOException
* @throws ClassNotFoundException
* @throws InterruptedException
*/
public static void runFP() throws IOException, InterruptedException, ClassNotFoundException{
//mahout fpg -i output.dat -o patterns -k 10 -method mapreduce -regex '[\ ]' -s 10
Parameters params = new Parameters();
params.set("minSupport","3");
params.set("maxHeapSize", "10");
params.set("numGroups", "1000");
params.set("treeCacheSize", "5");
params.set("splitPattern", " ");
String encoding = "UTF-8";
params.set("encoding", encoding);
params.set(PFPGrowth.USE_FPG2, "true");
Path inputDir = new Path("hdfs://192.168.9.72:9000/space_milk.dat");
Path outputDir = new Path("hdfs://192.168.9.72:9000/patterns");
params.set("input", inputDir.toString());
params.set("output", outputDir.toString());
Configuration conf = new Configuration();
// Delete stale output so the MapReduce job does not fail on an existing path.
HadoopUtil.delete(conf, outputDir);
PFPGrowth.runPFPGrowth(params);
}
/**
* Prints the FP-Growth result sequence file to stdout.
* @throws Exception
*/
public static void printFP() throws Exception{
// mahout seqdumper -i patterns/frequentpatterns/part-r-00000
SequenceFileDumper dumper = new SequenceFileDumper();
String[] args = new String[]{"-i","hdfs://192.168.9.72:9000/patterns/frequentpatterns/part-r-00000"};
dumper.run(args);
}
}
计算关联规则类
package myTesting.associate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;
public class ResultReader {

    /**
     * Reads Mahout's fList sequence file (frequent 1-itemsets).
     *
     * @param configuration Hadoop configuration used to resolve the file system
     * @param fileName      HDFS path of the fList sequence file
     * @return map from item id to its occurrence count
     * @throws Exception on read failure
     */
    public static Map<Integer, Long> readFrequency(Configuration configuration, String fileName) throws Exception {
        FileSystem fs = FileSystem.get(configuration);
        Reader frequencyReader = new SequenceFile.Reader(fs, new Path(fileName), configuration);
        Map<Integer, Long> frequency = new HashMap<Integer, Long>();
        try {
            Text key = new Text();
            LongWritable value = new LongWritable();
            while (frequencyReader.next(key, value)) {
                frequency.put(Integer.parseInt(key.toString()), value.get());
            }
        } finally {
            // The original leaked this reader; always close it.
            frequencyReader.close();
        }
        return frequency;
    }

    /**
     * Scans the frequent-pattern output of FP-Growth and prints association
     * rules of the form [antecedent] => suffix with support and confidence;
     * for 2-itemsets, lift and conviction are printed as well.
     *
     * @param configuration    Hadoop configuration
     * @param fileName         HDFS path of the frequentpatterns part file
     * @param transactionCount total number of transactions (support denominator)
     * @param frequency        occurrence count per item id (see {@link #readFrequency})
     * @param minSupport       minimum support threshold (exclusive)
     * @param minConfidence    minimum confidence threshold (exclusive)
     * @throws Exception on read failure
     */
    public static void readFrequentPatterns(
            Configuration configuration,
            String fileName,
            int transactionCount,
            Map<Integer, Long> frequency,
            double minSupport, double minConfidence) throws Exception {
        FileSystem fs = FileSystem.get(configuration);
        // Open the FP-Growth result file.
        Reader frequentPatternsReader = new SequenceFile.Reader(fs,
                new Path(fileName), configuration);
        try {
            Text key = new Text();
            TopKStringPatterns value = new TopKStringPatterns();
            while (frequentPatternsReader.next(key, value)) {
                long firstFrequencyItem = -1;
                String firstItemId = null; // suffix item of this record
                List<Pair<List<String>, Long>> patterns = value.getPatterns(); // conditional pattern bases
                int i = 0;
                for (Pair<List<String>, Long> pair : patterns) {
                    List<String> itemList = pair.getFirst();  // items of this pattern
                    Long occurrence = pair.getSecond();       // support count of this pattern
                    if (i == 0) {
                        // First pattern is the suffix item alone: remember its count.
                        firstFrequencyItem = occurrence;
                        firstItemId = itemList.get(0);
                    } else {
                        double support = (double) occurrence / transactionCount;
                        double confidence = (double) occurrence / firstFrequencyItem;
                        if (support > minSupport && confidence > minConfidence) {
                            // Antecedent = pattern items minus the suffix item.
                            // (The original also called remove(firstItemId) afterwards,
                            // which was dead code: the item is already filtered out here.)
                            List<String> listWithoutFirstItem = new ArrayList<String>();
                            for (String itemId : itemList) {
                                if (!itemId.equals(firstItemId)) {
                                    listWithoutFirstItem.add(itemId);
                                }
                            }
                            System.out.printf(
                                    "%s => %s: supp=%.3f, conf=%.3f",
                                    listWithoutFirstItem,
                                    firstItemId,
                                    support,
                                    confidence);
                            if (itemList.size() == 2) {
                                // For 2-itemsets we can also derive lift and conviction.
                                int otherItemId = -1;
                                for (String itemId : itemList) {
                                    if (!itemId.equals(firstItemId)) {
                                        otherItemId = Integer.parseInt(itemId);
                                        break;
                                    }
                                }
                                long otherItemOccurrence = frequency.get(otherItemId);
                                // NOTE(review): textbook lift is occurrence * N /
                                // (firstFrequencyItem * otherItemOccurrence); the original
                                // un-scaled formula is kept to preserve output — confirm intent.
                                double lift = (double) occurrence / (firstFrequencyItem * otherItemOccurrence);
                                double conviction = (1.0 - (double) otherItemOccurrence / transactionCount) / (1.0 - confidence);
                                System.out.printf(
                                        ", lift=%.3f, conviction=%.3f",
                                        lift, conviction);
                            }
                            System.out.printf("\n");
                        }
                    }
                    i++;
                }
            }
        } finally {
            // Close even when an exception interrupts the scan (original closed
            // only on the success path).
            frequentPatternsReader.close();
        }
    }

    public static void main(String[] args) throws Exception {
        int transactionCount = 9;  // total number of transactions
        String frequencyFilename = "hdfs://192.168.9.72:9000/patterns/fList";
        String frequentPatternsFilename = "hdfs://192.168.9.72:9000/patterns/frequentpatterns/part-r-00000";
        double minSupport = 0.001;    // support threshold
        double minConfidence = 0.3;   // confidence threshold
        Configuration configuration = new Configuration();
        // Frequent 1-itemsets: key = item id, value = occurrence count.
        Map<Integer, Long> frequency = readFrequency(configuration, frequencyFilename);
        // Derive and print the association rules.
        readFrequentPatterns(configuration, frequentPatternsFilename,
                transactionCount, frequency, minSupport, minConfidence);
    }
}
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;
public class ResultReader {
/**
* Reads Mahout's fList sequence file (frequent 1-itemsets) into a map
* from item id to occurrence count.
* NOTE(review): the reader is never closed here — resource leak.
*/
public static Map<Integer, Long> readFrequency(Configuration configuration, String fileName) throws Exception {
FileSystem fs = FileSystem.get(configuration);
Reader frequencyReader = new SequenceFile.Reader(fs,new Path(fileName), configuration);
Map<Integer, Long> frequency = new HashMap<Integer, Long>();
Text key = new Text();
LongWritable value = new LongWritable();
while(frequencyReader.next(key, value)) {
frequency.put(Integer.parseInt(key.toString()), value.get());
}
return frequency;
}
/**
* Scans the FP-Growth frequent-pattern file and prints rules of the form
* [antecedent] => suffix with support/confidence (plus lift/conviction for
* 2-itemsets).
*/
public static void readFrequentPatterns(
Configuration configuration,
String fileName,
int transactionCount,
Map<Integer, Long> frequency,
double minSupport, double minConfidence) throws Exception {
FileSystem fs = FileSystem.get(configuration);
// Open the FP-Growth result file.
Reader frequentPatternsReader = new SequenceFile.Reader(fs,
new Path(fileName), configuration);
Text key = new Text();
TopKStringPatterns value = new TopKStringPatterns();
while(frequentPatternsReader.next(key, value)) {
long firstFrequencyItem = -1;
String firstItemId = null;// suffix item of this record
List<Pair<List<String>, Long>> patterns = value.getPatterns();// conditional pattern bases
int i = 0;
for(Pair<List<String>, Long> pair: patterns) {
// Iterate over every pattern base of this suffix.
List<String> itemList = pair.getFirst();// items of the pattern base
Long occurrence = pair.getSecond();// support count of the pattern base
if (i == 0) {
// The first pattern is the suffix item alone; remember its count.
firstFrequencyItem = occurrence;
firstItemId = itemList.get(0);
} else {
double support = (double)occurrence / transactionCount;
double confidence = (double)occurrence / firstFrequencyItem;
if ((support > minSupport && confidence > minConfidence)) {
List<String> listWithoutFirstItem = new ArrayList<String>();
for(String itemId: itemList) {
if (!itemId.equals(firstItemId)) {
listWithoutFirstItem.add(itemId);
}
}
String firstItem = firstItemId;
// NOTE(review): redundant — firstItemId was already filtered out above.
listWithoutFirstItem.remove(firstItemId);
System.out.printf(
"%s => %s: supp=%.3f, conf=%.3f",
listWithoutFirstItem,
firstItem,
support,
confidence);
if (itemList.size() == 2) {
// we can easily compute the lift and the conviction for set of
// size 2, so do it
int otherItemId = -1;
for(String itemId: itemList) {
if (!itemId.equals(firstItemId)) {
otherItemId = Integer.parseInt(itemId);
break;
}
}
long otherItemOccurrence = frequency.get(otherItemId);
// NOTE(review): textbook lift also multiplies by transactionCount — confirm intent.
double lift = (double)occurrence / (firstFrequencyItem * otherItemOccurrence);
double conviction = (1.0 - (double)otherItemOccurrence / transactionCount) / (1.0 - confidence);
System.out.printf(
", lift=%.3f, conviction=%.3f",
lift, conviction);
}
System.out.printf("\n");
}
}
i++;
}
}
frequentPatternsReader.close();
}
public static void main(String args[]) throws Exception {
int transactionCount = 9;// total number of transactions
String frequencyFilename = "hdfs://192.168.9.72:9000/patterns/fList";
String frequentPatternsFilename = "hdfs://192.168.9.72:9000/patterns/frequentpatterns/part-r-00000";
double minSupport = 0.001;// support threshold
double minConfidence = 0.3;// confidence threshold
Configuration configuration = new Configuration();
// Frequent 1-itemsets: key = item id, value = occurrence count.
Map<Integer, Long> frequency = readFrequency(configuration, frequencyFilename);
// Compute the association rules.
readFrequentPatterns(configuration, frequentPatternsFilename,
transactionCount, frequency, minSupport, minConfidence);
}
}
测试结果,Mahout算法计算结果数据:
Key: 1: Value: ([1],6), ([4, 1],5), ([3, 1],5), ([3, 4, 1],4), ([2, 4, 1],4), ([2, 3, 4, 1],3)
Key: 2: Value: ([2],7), ([2, 4],6), ([2, 3],5), ([2, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3), ([2, 5],3)
Key: 3: Value: ([3],7), ([3, 4],5), ([3, 1],5), ([2, 3],5), ([3, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3), ([3, 5],3)
Key: 4: Value: ([4],7), ([2, 4],6), ([4, 1],5), ([3, 4],5), ([3, 4, 1],4), ([2, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3)
Key: 5: Value: ([5],4), ([3, 5],3), ([2, 5],3)
Key: 2: Value: ([2],7), ([2, 4],6), ([2, 3],5), ([2, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3), ([2, 5],3)
Key: 3: Value: ([3],7), ([3, 4],5), ([3, 1],5), ([2, 3],5), ([3, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3), ([3, 5],3)
Key: 4: Value: ([4],7), ([2, 4],6), ([4, 1],5), ([3, 4],5), ([3, 4, 1],4), ([2, 4, 1],4), ([2, 3, 4],4), ([2, 3, 4, 1],3)
Key: 5: Value: ([5],4), ([3, 5],3), ([2, 5],3)
计算所得的所有关联规则项:
[4] => 1: supp=0.556, conf=0.833, lift=0.119, conviction=1.333
[3] => 1: supp=0.556, conf=0.833, lift=0.119, conviction=1.333
[3, 4] => 1: supp=0.444, conf=0.667
[2, 4] => 1: supp=0.444, conf=0.667
[2, 3, 4] => 1: supp=0.333, conf=0.500
[4] => 2: supp=0.667, conf=0.857, lift=0.122, conviction=1.556
[3] => 2: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[4, 1] => 2: supp=0.444, conf=0.571
[3, 4] => 2: supp=0.444, conf=0.571
[3, 4, 1] => 2: supp=0.333, conf=0.429
[5] => 2: supp=0.333, conf=0.429, lift=0.107, conviction=0.972
[4] => 3: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[1] => 3: supp=0.556, conf=0.714, lift=0.119, conviction=1.167
[2] => 3: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[4, 1] => 3: supp=0.444, conf=0.571
[2, 4] => 3: supp=0.444, conf=0.571
[2, 4, 1] => 3: supp=0.333, conf=0.429
[5] => 3: supp=0.333, conf=0.429, lift=0.107, conviction=0.972
[2] => 4: supp=0.667, conf=0.857, lift=0.122, conviction=1.556
[1] => 4: supp=0.556, conf=0.714, lift=0.119, conviction=1.167
[3] => 4: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[3, 1] => 4: supp=0.444, conf=0.571
[2, 1] => 4: supp=0.444, conf=0.571
[2, 3] => 4: supp=0.444, conf=0.571
[2, 3, 1] => 4: supp=0.333, conf=0.429
[3] => 5: supp=0.333, conf=0.750, lift=0.107, conviction=0.889
[2] => 5: supp=0.333, conf=0.750, lift=0.107, conviction=0.889
[3] => 1: supp=0.556, conf=0.833, lift=0.119, conviction=1.333
[3, 4] => 1: supp=0.444, conf=0.667
[2, 4] => 1: supp=0.444, conf=0.667
[2, 3, 4] => 1: supp=0.333, conf=0.500
[4] => 2: supp=0.667, conf=0.857, lift=0.122, conviction=1.556
[3] => 2: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[4, 1] => 2: supp=0.444, conf=0.571
[3, 4] => 2: supp=0.444, conf=0.571
[3, 4, 1] => 2: supp=0.333, conf=0.429
[5] => 2: supp=0.333, conf=0.429, lift=0.107, conviction=0.972
[4] => 3: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[1] => 3: supp=0.556, conf=0.714, lift=0.119, conviction=1.167
[2] => 3: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[4, 1] => 3: supp=0.444, conf=0.571
[2, 4] => 3: supp=0.444, conf=0.571
[2, 4, 1] => 3: supp=0.333, conf=0.429
[5] => 3: supp=0.333, conf=0.429, lift=0.107, conviction=0.972
[2] => 4: supp=0.667, conf=0.857, lift=0.122, conviction=1.556
[1] => 4: supp=0.556, conf=0.714, lift=0.119, conviction=1.167
[3] => 4: supp=0.556, conf=0.714, lift=0.102, conviction=0.778
[3, 1] => 4: supp=0.444, conf=0.571
[2, 1] => 4: supp=0.444, conf=0.571
[2, 3] => 4: supp=0.444, conf=0.571
[2, 3, 1] => 4: supp=0.333, conf=0.429
[3] => 5: supp=0.333, conf=0.750, lift=0.107, conviction=0.889
[2] => 5: supp=0.333, conf=0.750, lift=0.107, conviction=0.889
分析发现,([2, 4, 1],4) 是满足条件的频繁项,那么它的所有两项集子集也是满足条件的频繁项;所以 2=>1 或者 1=>2 这样的关联规则项应该是满足条件的;很显然,根据上面的关联规则算法没法得到它们。
由于我只需要计算两个元素之间的关联规则,所以写了一个简单的 MapReduce 作业来实现它:
public class FPTreeAssoRuleMakerJob {
private static final String MY_FREQUENT="myfrequent";
public static void makeSimpleAssoRule(Parameters params) throws IOException, InterruptedException, ClassNotFoundException{
Configuration conf = new Configuration();
conf.set("mapred.compress.map.output", "true");
conf.set("mapred.output.compression.type", "BLOCK");
String outputPath = params.get("output");
Path input = new Path(outputPath, PFPGrowth.FP_GROWTH);
Job job = new Job(conf, "AssoRule Driver running over input: " + input);
job.setJarByClass(FPTreeAssoRuleMakerJob.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(TextArrayPair.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job, input);
Path outPath = new Path(outputPath,MY_FREQUENT);
FileOutputFormat.setOutputPath(job, outPath);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setMapperClass(AssoRuleMap.class);
// job.setCombinerClass(Reduce.class);
job.setReducerClass(AssoRuleReduce.class);
// job.setOutputFormatClass(SequenceFileOutputFormat.class);
HadoopUtil.delete(conf, outPath);
boolean succeeded = job.waitForCompletion(true);
if (!succeeded) {
throw new IllegalStateException("Job failed!");
}
//完成标记文件
FileSystem fs = FileSystem.get(URI.create(outputPath), conf);
HdfsUtils.createOverFile(fs,outputPath+Path.SEPARATOR+Constants.mark_UserGroupsAnalysis , null);
}
public static class AssoRuleMap extends Mapper<Text, TopKStringPatterns, Text, TextArrayPair>{
/**
* 根据FP-tree结果树,提取所有可能的频繁一项集、频繁二项集;并以一项集或者二项集中的某个元素作为key,<br>
* 对应的频繁项集作为value(value为自定义数据类型),传给reduce处理
*/
protected void map(Text key, TopKStringPatterns values, Context context) throws java.io.IOException ,InterruptedException {
System.out.println("key:"+key);
System.out.println("values:"+values);
List<Pair<List<String>,Long>> patterns = values.getPatterns();
TextArrayPair tap;
ArrayList<String> array;
for(Pair<List<String>,Long> pattern:patterns){
List<String> conditions = pattern.getFirst();
Long value = pattern.getSecond();
if(conditions.size()==1){//1项集
tap = new TextArrayPair();
tap.setFields(conditions);
tap.setCount(value);
context.write(key,tap);
}else{
//最后一个元素为后缀元素
if(conditions.get(conditions.size()-1).equals(key.toString())){
String first;
String second;
for(int i=0;i<conditions.size()-1;i++){
array = new ArrayList<String>();
first = conditions.get(i).trim();
second = key.toString().trim();
array.add(first);
array.add(second);
tap = new TextArrayPair();
tap.setCount(value);
tap.setFields(array);
context.write(key,tap);//(后缀元素,频繁项集)=》reduce
context.write(new Text(first), tap);//(非后缀元素,频繁项集)=> reduce
}
}
}
}
};
}
public static class AssoRuleReduce extends Reducer<Text, TextArrayPair, Text, DoubleWritable>{
/**
* key作为分母,values作为分子;处理分母一样的集合,分子一样的取出现次数大的;分子==分母,计算分母出现次数
*
* */
protected void reduce(Text key, Iterable<TextArrayPair> values,Context context) throws IOException ,InterruptedException {
System.out.println("key:"+key);
//迭代器中values放进HashMap中
Map<List<String>,Long> itemsMap = new HashMap<List<String>, Long>();
Long deniminator =1L;
for(TextArrayPair value:values){
for(String valStr:value.getFields()){
System.out.print(valStr+" ");
}
System.out.println(","+value.getCount());
if(value.getFields().size()==1){//1项集,该集合中频繁项出现次数即后缀元素出现次数
deniminator = value.getCount();
}else{//2项集合
if(itemsMap.containsKey(value.getFields())){//该频繁项集合已存在
if( value.getCount()>itemsMap.get(value.getFields())){//如果
System.out.println("重复值。。。。");
itemsMap.put(value.getFields(), value.getCount());
}
}else{//不存在
itemsMap.put(value.getFields(), value.getCount());
}
}
}
if(deniminator==null || deniminator<1){
System.out.println("=============数据有误,分母设置为最大值,conf将为0==================");
deniminator = Long.MAX_VALUE;
}
String first;
String second;
Double conf;
//计算关联项的conf
Set<List<String>> keyset = itemsMap.keySet();
Iterator<List<String>> it = keyset.iterator();
System.out.println("===========打印满足条件的关联项============");
while(it.hasNext()){
List<String> fields = it.next();
if(fields.size()==2){
first = fields.get(0);
second = fields.get(1);
conf = itemsMap.get(fields)*1.0/deniminator;
if(first.equals(key.toString())){
context.write(new Text(first+"=>"+second), new DoubleWritable(conf));
System.out.println(first+"=>"+second+","+conf);
}else{
context.write(new Text(second+"=>"+first), new DoubleWritable(conf));
System.out.println(second+"=>"+first+","+conf);
}
}
}
};
}
}
private static final String MY_FREQUENT="myfrequent";
/**
* Launches a MapReduce job that derives two-item association rules from the
* PFPGrowth frequent-pattern output; input is <output>/fpgrowth, the rules
* are written to <output>/myfrequent.
*/
public static void makeSimpleAssoRule(Parameters params) throws IOException, InterruptedException, ClassNotFoundException{
Configuration conf = new Configuration();
conf.set("mapred.compress.map.output", "true");
conf.set("mapred.output.compression.type", "BLOCK");
String outputPath = params.get("output");
Path input = new Path(outputPath, PFPGrowth.FP_GROWTH);
Job job = new Job(conf, "AssoRule Driver running over input: " + input);
job.setJarByClass(FPTreeAssoRuleMakerJob.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(TextArrayPair.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.addInputPath(job, input);
Path outPath = new Path(outputPath,MY_FREQUENT);
FileOutputFormat.setOutputPath(job, outPath);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setMapperClass(AssoRuleMap.class);
// job.setCombinerClass(Reduce.class);
job.setReducerClass(AssoRuleReduce.class);
// job.setOutputFormatClass(SequenceFileOutputFormat.class);
HadoopUtil.delete(conf, outPath);
boolean succeeded = job.waitForCompletion(true);
if (!succeeded) {
throw new IllegalStateException("Job failed!");
}
// Write the completion-marker file.
FileSystem fs = FileSystem.get(URI.create(outputPath), conf);
HdfsUtils.createOverFile(fs,outputPath+Path.SEPARATOR+Constants.mark_UserGroupsAnalysis , null);
}
public static class AssoRuleMap extends Mapper<Text, TopKStringPatterns, Text, TextArrayPair>{
/**
* From the FP-Growth result, extracts all candidate frequent 1-itemsets and
* 2-itemsets; each itemset is emitted keyed by one of its member items (the
* value is a custom writable holding the itemset and its count) so the
* reducer can compute per-item confidences.
*/
protected void map(Text key, TopKStringPatterns values, Context context) throws java.io.IOException ,InterruptedException {
System.out.println("key:"+key);
System.out.println("values:"+values);
List<Pair<List<String>,Long>> patterns = values.getPatterns();
TextArrayPair tap;
ArrayList<String> array;
for(Pair<List<String>,Long> pattern:patterns){
List<String> conditions = pattern.getFirst();
Long value = pattern.getSecond();
if(conditions.size()==1){// frequent 1-itemset: the suffix item's own count
tap = new TextArrayPair();
tap.setFields(conditions);
tap.setCount(value);
context.write(key,tap);
}else{
// The last element of the pattern is the suffix item.
if(conditions.get(conditions.size()-1).equals(key.toString())){
String first;
String second;
for(int i=0;i<conditions.size()-1;i++){
array = new ArrayList<String>();
first = conditions.get(i).trim();
second = key.toString().trim();
array.add(first);
array.add(second);
tap = new TextArrayPair();
tap.setCount(value);
tap.setFields(array);
context.write(key,tap);// (suffix item, itemset) -> reduce
context.write(new Text(first), tap);// (non-suffix item, itemset) -> reduce
}
}
}
}
};
}
public static class AssoRuleReduce extends Reducer<Text, TextArrayPair, Text, DoubleWritable>{
/**
* The key item is the denominator, the 2-itemsets in values supply the
* numerators; for identical itemsets the larger count wins. A 1-itemset
* value carries the key item's own occurrence count (the denominator).
* Emits "X=>Y" with confidence count(X,Y)/count(key).
* */
protected void reduce(Text key, Iterable<TextArrayPair> values,Context context) throws IOException ,InterruptedException {
System.out.println("key:"+key);
// Collect the iterated values into a HashMap keyed by itemset.
// NOTE(review): getFields() is used directly as the map key — if Hadoop
// reuses the writable's backing list across iterations this corrupts the
// keys; confirm TextArrayPair returns a fresh list per value.
Map<List<String>,Long> itemsMap = new HashMap<List<String>, Long>();
Long deniminator =1L;
for(TextArrayPair value:values){
for(String valStr:value.getFields()){
System.out.print(valStr+" ");
}
System.out.println(","+value.getCount());
if(value.getFields().size()==1){// 1-itemset: its count is the key item's own occurrence count
deniminator = value.getCount();
}else{// 2-itemset
if(itemsMap.containsKey(value.getFields())){// itemset already seen
if( value.getCount()>itemsMap.get(value.getFields())){// keep the larger count
System.out.println("重复值。。。。");
itemsMap.put(value.getFields(), value.getCount());
}
}else{// first time this itemset is seen
itemsMap.put(value.getFields(), value.getCount());
}
}
}
if(deniminator==null || deniminator<1){
System.out.println("=============数据有误,分母设置为最大值,conf将为0==================");
deniminator = Long.MAX_VALUE;
}
String first;
String second;
Double conf;
// Compute the confidence of each candidate rule.
Set<List<String>> keyset = itemsMap.keySet();
Iterator<List<String>> it = keyset.iterator();
System.out.println("===========打印满足条件的关联项============");
while(it.hasNext()){
List<String> fields = it.next();
if(fields.size()==2){
first = fields.get(0);
second = fields.get(1);
conf = itemsMap.get(fields)*1.0/deniminator;
if(first.equals(key.toString())){
context.write(new Text(first+"=>"+second), new DoubleWritable(conf));
System.out.println(first+"=>"+second+","+conf);
}else{
context.write(new Text(second+"=>"+first), new DoubleWritable(conf));
System.out.println(second+"=>"+first+","+conf);
}
}
}
};
}
}
如有更简单的实现方法,希望不吝赐教!!!