关联分析是用来做什么的?这边有一个经典的例子“超市购物单”,文件market内容如下:
牛奶,鸡蛋,面包,薯片
鸡蛋,爆米花,薯片,啤酒
鸡蛋,面包,薯片
牛奶,鸡蛋,面包,爆米花,薯片,啤酒
牛奶,面包,啤酒
鸡蛋,面包,啤酒
牛奶,面包,薯片
牛奶,鸡蛋,面包,黄油,薯片
牛奶,鸡蛋,黄油,薯片
每一行可以看作一个购物单,关联分析就是用来分析哪些物品经常会被同时购买(也就是关联度较大)。
其中一个分析算法的java实现如下:
TreeNode节点实现
import java.util.ArrayList;
import java.util.List;
public class TreeNode implements Comparable<TreeNode> {
private String name; // 节点名称
private int count; // 计数
private TreeNode parent; // 父节点
private List<TreeNode> children; // 子节点
private TreeNode nextHomonym; // 下一个同名节点
public TreeNode() {
}
public TreeNode(String name) {
this.name = name;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
public TreeNode getParent() {
return parent;
}
public void setParent(TreeNode parent) {
this.parent = parent;
}
public List<TreeNode> getChildren() {
return children;
}
public void addChild(TreeNode child) {
if (this.getChildren() == null) {
List<TreeNode> list = new ArrayList<TreeNode>();
list.add(child);
this.setChildren(list);
} else {
this.getChildren().add(child);
}
}
public TreeNode findChild(String name) {
List<TreeNode> children = this.getChildren();
if (children != null) {
for (TreeNode child : children) {
if (child.getName().equals(name)) {
return child;
}
}
}
return null;
}
public void setChildren(List<TreeNode> children) {
this.children = children;
}
public void printChildrenName() {
List<TreeNode> children = this.getChildren();
if (children != null) {
for (TreeNode child : children) {
System.out.print(child.getName() + " ");
}
} else {
System.out.print("null");
}
}
public TreeNode getNextHomonym() {
return nextHomonym;
}
public void setNextHomonym(TreeNode nextHomonym) {
this.nextHomonym = nextHomonym;
}
public void countIncrement(int n) {
this.count += n;
}
@Override
public int compareTo(TreeNode arg0) {
// TODO Auto-generated method stub
int count0 = arg0.getCount();
// 跟默认的比较大小相反,导致调用Arrays.sort()时是按降序排列
return count0 - this.count;
}
}
FPTree实现
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
public class FPTree {
private int minSuport;
public int getMinSuport() {
return minSuport;
}
public void setMinSuport(int minSuport) {
this.minSuport = minSuport;
}
// 从若干个文件中读入Transaction Record
public List<List<String>> readTransRocords(String... filenames) {
List<List<String>> transaction = null;
if (filenames.length > 0) {
transaction = new LinkedList<List<String>>();
for (String filename : filenames) {
try {
FileReader fr = new FileReader(filename);
BufferedReader br = new BufferedReader(fr);
try {
String line;
List<String> record;
while ((line = br.readLine()) != null) {
if (line.trim().length() > 0) {
String str[] = line.split(",");
record = new LinkedList<String>();
for (String w : str)
record.add(w);
transaction.add(record);
}
}
} finally {
br.close();
}
} catch (IOException ex) {
System.out.println("Read transaction records failed."
+ ex.getMessage());
System.exit(1);
}
}
}
return transaction;
}
// FP-Growth算法
public void FPGrowth(List<List<String>> transRecords,
List<String> postPattern) {
// 构建项头表,同时也是频繁1项集
ArrayList<TreeNode> HeaderTable = buildHeaderTable(transRecords);
// 构建FP-Tree
TreeNode treeRoot = buildFPTree(transRecords, HeaderTable);
// 如果FP-Tree为空则返回
if (treeRoot.getChildren() == null
|| treeRoot.getChildren().size() == 0)
return;
// 输出项头表的每一项+postPattern
if (postPattern != null) {
for (TreeNode header : HeaderTable) {
System.out.print(header.getCount() + "\t" + header.getName());
for (String ele : postPattern)
System.out.print("\t" + ele);
System.out.println();
}
}
// 找到项头表的每一项的条件模式基,进入递归迭代
for (TreeNode header : HeaderTable) {
// 后缀模式增加一项
List<String> newPostPattern = new LinkedList<String>();
newPostPattern.add(header.getName());
if (postPattern != null)
newPostPattern.addAll(postPattern);
// 寻找header的条件模式基CPB,放入newTransRecords中
List<List<String>> newTransRecords = new LinkedList<List<String>>();
TreeNode backnode = header.getNextHomonym();
while (backnode != null) {
int counter = backnode.getCount();
List<String> prenodes = new ArrayList<String>();
TreeNode parent = backnode;
// 遍历backnode的祖先节点,放到prenodes中
while ((parent = parent.getParent()).getName() != null) {
prenodes.add(parent.getName());
}
while (counter-- > 0) {
newTransRecords.add(prenodes);
}
backnode = backnode.getNextHomonym();
}
// 递归迭代
FPGrowth(newTransRecords, newPostPattern);
}
}
// 构建项头表,同时也是频繁1项集
public ArrayList<TreeNode> buildHeaderTable(List<List<String>> transRecords) {
ArrayList<TreeNode> F1 = null;
if (transRecords.size() > 0) {
F1 = new ArrayList<TreeNode>();
Map<String, TreeNode> map = new HashMap<String, TreeNode>();
// 计算事务数据库中各项的支持度
for (List<String> record : transRecords) {
for (String item : record) {
if (!map.keySet().contains(item)) {
TreeNode node = new TreeNode(item);
node.setCount(1);
map.put(item, node);
} else {
map.get(item).countIncrement(1);
}
}
}
// 把支持度大于(或等于)minSup的项加入到F1中
Set<String> names = map.keySet();
for (String name : names) {
TreeNode tnode = map.get(name);
if (tnode.getCount() >= minSuport) {
F1.add(tnode);
}
}
Collections.sort(F1);
return F1;
} else {
return null;
}
}
// 构建FP-Tree
public TreeNode buildFPTree(List<List<String>> transRecords,
ArrayList<TreeNode> F1) {
TreeNode root = new TreeNode(); // 创建树的根节点
for (List<String> transRecord : transRecords) {
LinkedList<String> record = sortByF1(transRecord, F1);
TreeNode subTreeRoot = root;
TreeNode tmpRoot = null;
if (root.getChildren() != null) {
while (!record.isEmpty()
&& (tmpRoot = subTreeRoot.findChild(record.peek())) != null) {
tmpRoot.countIncrement(1);
subTreeRoot = tmpRoot;
record.poll();
}
}
addNodes(subTreeRoot, record, F1);
}
return root;
}
// 把交易记录按项的频繁程序降序排列
public LinkedList<String> sortByF1(List<String> transRecord,
ArrayList<TreeNode> F1) {
Map<String, Integer> map = new HashMap<String, Integer>();
for (String item : transRecord) {
// 由于F1已经是按降序排列的,
for (int i = 0; i < F1.size(); i++) {
TreeNode tnode = F1.get(i);
if (tnode.getName().equals(item)) {
map.put(item, i);
}
}
}
ArrayList<Entry<String, Integer>> al = new ArrayList<Entry<String, Integer>>(
map.entrySet());
Collections.sort(al, new Comparator<Map.Entry<String, Integer>>() {
@Override
public int compare(Entry<String, Integer> arg0,
Entry<String, Integer> arg1) {
// 降序排列
return arg0.getValue() - arg1.getValue();
}
});
LinkedList<String> rest = new LinkedList<String>();
for (Entry<String, Integer> entry : al) {
rest.add(entry.getKey());
}
return rest;
}
// 把record作为ancestor的后代插入树中
public void addNodes(TreeNode ancestor, LinkedList<String> record,
ArrayList<TreeNode> F1) {
if (record.size() > 0) {
while (record.size() > 0) {
String item = record.poll();
TreeNode leafnode = new TreeNode(item);
leafnode.setCount(1);
leafnode.setParent(ancestor);
ancestor.addChild(leafnode);
for (TreeNode f1 : F1) {
if (f1.getName().equals(item)) {
while (f1.getNextHomonym() != null) {
f1 = f1.getNextHomonym();
}
f1.setNextHomonym(leafnode);
break;
}
}
addNodes(leafnode, record, F1);
}
}
}
public static void main(String[] args) {
FPTree fptree = new FPTree();
fptree.setMinSuport(4);
List<List<String>> transRecords = fptree
.readTransRocords(System.getProperty("user.dir") + "\\resource\\market");
fptree.FPGrowth(transRecords, null);
}
}
可以看到main方法里,读market文件,指定要关联次数4次以上的物品,
调用FPGrowth方法输出如下:
6 薯片 鸡蛋
5 薯片 面包
5 鸡蛋 面包
4 薯片 鸡蛋 面包
5 薯片 牛奶
5 面包 牛奶
4 鸡蛋 牛奶
4 薯片 面包 牛奶
4 薯片 鸡蛋 牛奶