最近在使用 Mahout 基于内存的关联规则(FP-Growth)推荐算法,发现网上相关的代码资料很少,所以把代码记录下来:
// Mines frequent patterns from a space-separated transaction file with Mahout's
// in-memory FPGrowth, collects them in memory, and prints one association rule
// (with support and confidence) per mined pattern.

// Total number of shopping-basket transactions in the training set; used as the
// denominator when computing support.
double transactionCount = 5354;
int minSupport = 3;   // minimum support handed to FPGrowth
int maxHeapSize = 50; // heap size handed to generateTopKFrequentPatterns
String input = "./models/text/action.dat";
String output = "output";
FPGrowth<String> fp = new FPGrowth<String>();
Configuration conf = new Configuration();
String pattern = "[\\ ]"; // regex used to split each transaction line into items

// Mining results are collected here instead of being written to the sequence
// file: the i-th key corresponds to the i-th value.
List<Text> keyList = new LinkedList<Text>();
List<TopKStringPatterns> valueList = new LinkedList<TopKStringPatterns>();

try {
    FileSystem fs = FileSystem.get(conf);
    try {
        // The writer is opened on the output path but no records are appended;
        // the patterns go to the in-memory collector below.
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(output), Text.class, TopKStringPatterns.class);
        try {
            Charset encoding = Charset.forName("UTF-8");
            StringRecordIterator transactions = new StringRecordIterator(
                    new FileLineIterable(new File(input), encoding, false), pattern);
            System.out.print("\nthe end! ");

            // A separate iterator over the same file is used for the frequency
            // list so that `transactions` is still unread when mining starts.
            List<Pair<String, Long>> generateFList = fp.generateFList(
                    new StringRecordIterator(new FileLineIterable(new File(input), encoding, false), pattern),
                    minSupport);

            // Progress callbacks are intentionally ignored.
            StatusUpdater updater = new StatusUpdater() {
                @Override
                public void update(String status) {
                }
            };

            StringOutputCollector<Text, TopKStringPatterns> collector =
                    new StringOutputCollector<Text, TopKStringPatterns>(keyList, valueList);
            StringOutputConverter soc = new StringOutputConverter(collector);
            System.out.print("\n 开始! ");
            fp.generateTopKFrequentPatterns(transactions, generateFList, minSupport, maxHeapSize, null, soc, updater);
        } finally {
            // Close even when mining fails so the output stream is not leaked.
            writer.close();
        }
    } finally {
        fs.close();
    }

    // Walk keys and values in lock-step; iterators avoid the O(n) cost of
    // LinkedList.get(i) on every step.
    java.util.Iterator<Text> keys = keyList.iterator();
    for (TopKStringPatterns topPatterns : valueList) {
        Text key = keys.next();
        System.out.println(key + " / " + topPatterns);

        // The first pattern of each list is the baseline: its count is the
        // confidence denominator and its first item is the rule's right-hand side.
        // NOTE(review): presumably this is the single-item pattern for `key` —
        // confirm against the ordering FPGrowth emits.
        long firstFrequencyItem = -1;
        String firstItemId = null;
        boolean isFirstPattern = true;

        for (Pair<List<String>, Long> pair : topPatterns.getPatterns()) {
            List<String> itemList = pair.getFirst();
            long occurrence = pair.getSecond();

            if (isFirstPattern) {
                firstFrequencyItem = occurrence;
                firstItemId = itemList.get(0);
                isFirstPattern = false;
                continue;
            }

            double support = occurrence / transactionCount;
            double confidence = (double) occurrence / firstFrequencyItem;

            // Left-hand side of the rule: every item of the pattern except the baseline item.
            List<String> listWithoutFirstItem = new ArrayList<String>();
            for (String itemId : itemList) {
                if (!itemId.equals(firstItemId)) {
                    listWithoutFirstItem.add(itemId);
                }
            }

            System.out.printf("%s => %s: supp=%.3f, conf=%.3f", listWithoutFirstItem, firstItemId, support, confidence);
            System.out.printf("\n");
        }
    }
} catch (IOException e) {
    e.printStackTrace();
}
}
该代码用 Java 改写了 Mahout 中 org.apache.mahout.fpm.pfpgrowth.FPGrowthDriver 的启动流程:先生成频繁项集,并把 Mahout 原本应写入 output 文件的频繁项集改为收集到 soc 对象中;然后再根据这些频繁项集计算支持度和置信度。