关联分析——FPTree的一种java实现

最新推荐文章于 2021-03-02 04:18:15 发布

cq1982

最新推荐文章于 2021-03-02 04:18:15 发布

阅读量2.1k

点赞数 1

分类专栏：数据挖掘

数据挖掘专栏收录该内容

4 篇文章 1 订阅

订阅专栏

关联分析是用来做什么的？这边有一个经典的例子“超市购物单”，文件market内容如下：

牛奶，鸡蛋，面包，薯片
鸡蛋，爆米花，薯片，啤酒
鸡蛋，面包，薯片
牛奶，鸡蛋，面包，爆米花，薯片，啤酒
牛奶，面包，啤酒
鸡蛋，面包，啤酒
牛奶，面包，薯片
牛奶，鸡蛋，面包，黄油，薯片
牛奶，鸡蛋，黄油，薯片

每一行可以看作一个购物单，关联分析就是用来分析哪些物品经常会被同时购买（也就是关联度较大）。

其中一种分析算法（FP-Growth，基于FP-Tree）的Java实现如下：

TreeNode节点实现

import java.util.ArrayList;
import java.util.List;

/**
 * A node of an FP-tree, also used as an entry of the header table.
 *
 * <p>Each node carries an item name, an occurrence count, a link to its
 * parent, a lazily created list of children, and a link to the next node
 * that holds the same item name.
 *
 * <p>The natural ordering is by <em>descending</em> count, so sorting a list
 * of nodes puts the most frequent item first. The ordering looks only at the
 * count and is therefore not consistent with {@code equals}.
 */
public class TreeNode implements Comparable<TreeNode> {

	private String name; // item name; stays null when the no-arg constructor is used
	private int count; // occurrence count
	private TreeNode parent; // parent node
	private List<TreeNode> children; // child nodes; null until the first child is added
	private TreeNode nextHomonym; // next node carrying the same item name

	/** Creates a node without a name. */
	public TreeNode() {

	}

	/**
	 * Creates a node for the given item.
	 *
	 * @param name item name
	 */
	public TreeNode(String name) {
		this.name = name;
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public int getCount() {
		return count;
	}

	public void setCount(int count) {
		this.count = count;
	}

	public TreeNode getParent() {
		return parent;
	}

	public void setParent(TreeNode parent) {
		this.parent = parent;
	}

	/**
	 * @return the live child list, or {@code null} if no child has been added
	 *         and {@link #setChildren(List)} has not been called
	 */
	public List<TreeNode> getChildren() {
		return children;
	}

	/**
	 * Appends a child, creating the child list on first use. The child's
	 * parent link is not modified by this method.
	 *
	 * @param child node to append
	 */
	public void addChild(TreeNode child) {
		if (children == null) {
			children = new ArrayList<TreeNode>();
		}
		children.add(child);
	}

	/**
	 * Looks up a direct child by item name.
	 *
	 * @param name item name to search for
	 * @return the first child whose name equals {@code name}, or {@code null}
	 *         if there is none (including when this node has no children)
	 */
	public TreeNode findChild(String name) {
		if (children == null) {
			return null;
		}
		for (TreeNode child : children) {
			if (child.getName().equals(name)) {
				return child;
			}
		}
		return null;
	}

	public void setChildren(List<TreeNode> children) {
		this.children = children;
	}

	/**
	 * Prints the names of all direct children to standard output, each
	 * followed by a space, or the text {@code null} when the child list is
	 * absent. No line terminator is written.
	 */
	public void printChildrenName() {
		if (children == null) {
			System.out.print("null");
			return;
		}
		for (TreeNode child : children) {
			System.out.print(child.getName() + " ");
		}
	}

	public TreeNode getNextHomonym() {
		return nextHomonym;
	}

	public void setNextHomonym(TreeNode nextHomonym) {
		this.nextHomonym = nextHomonym;
	}

	/**
	 * Adds {@code n} to the count.
	 *
	 * @param n amount to add
	 */
	public void countIncrement(int n) {
		this.count += n;
	}

	/**
	 * Orders nodes by descending count (the reverse of the usual integer
	 * order), so that sorting yields the highest count first.
	 *
	 * @param arg0 node to compare with
	 * @return a negative value if this node has the larger count, zero if the
	 *         counts are equal, a positive value otherwise
	 */
	@Override
	public int compareTo(TreeNode arg0) {
		int count0 = arg0.getCount();
		// Explicit three-way comparison: the subtraction "count0 - this.count"
		// overflows for counts that are far apart and then reports the wrong sign.
		if (count0 < this.count) {
			return -1;
		}
		return (count0 == this.count) ? 0 : 1;
	}
}

FPTree实现

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

/**
 * FP-Growth frequent-itemset mining over transactions read from text files.
 *
 * <p>A transaction is a list of item names. {@link #FPGrowth(List, List)}
 * builds a header table and an FP-tree for the given transactions, prints the
 * frequent itemsets that extend the current suffix pattern, and recurses on
 * the conditional pattern base of every header-table item.
 */
public class FPTree {

	// Minimum support: an item is kept when its count is >= this value.
	private int minSuport;

	public int getMinSuport() {
		return minSuport;
	}

	public void setMinSuport(int minSuport) {
		this.minSuport = minSuport;
	}

	/**
	 * Reads transaction records from the given files. Every non-blank line is
	 * one transaction; items are separated by the full-width comma "，" and
	 * are stored exactly as split (no trimming).
	 *
	 * <p>If a file cannot be read, a message is printed to standard output
	 * and the JVM is terminated with exit status 1.
	 *
	 * @param filenames paths of the files to read
	 * @return all transactions in file order, or {@code null} when no file
	 *         name was supplied
	 */
	public List<List<String>> readTransRocords(String... filenames) {
		// A null varargs array is treated like "no files" instead of throwing.
		if (filenames == null || filenames.length == 0) {
			return null;
		}
		List<List<String>> transaction = new LinkedList<List<String>>();
		for (String filename : filenames) {
			try {
				BufferedReader br = new BufferedReader(new FileReader(filename));
				try {
					String line;
					while ((line = br.readLine()) != null) {
						if (line.trim().length() > 0) {
							List<String> record = new LinkedList<String>();
							for (String w : line.split("，")) {
								record.add(w);
							}
							transaction.add(record);
						}
					}
				} finally {
					br.close();
				}
			} catch (IOException ex) {
				System.out.println("Read transaction records failed."
						+ ex.getMessage());
				System.exit(1);
			}
		}
		return transaction;
	}

	/**
	 * Runs FP-Growth on the given transactions and prints the result to
	 * standard output.
	 *
	 * <p>When {@code postPattern} is non-null, one line is printed per
	 * header-table item: the item's count, the item name, and then every
	 * element of {@code postPattern}, all separated by tabs. With a
	 * {@code null} suffix (the top-level call) nothing is printed at this
	 * level, so single items are not listed.
	 *
	 * @param transRecords transactions to mine; {@code null} or empty input
	 *                     is ignored
	 * @param postPattern  suffix pattern accumulated so far, or {@code null}
	 *                     for the top-level call
	 */
	public void FPGrowth(List<List<String>> transRecords,
			List<String> postPattern) {
		// Nothing to mine; also protects against the null returned by
		// readTransRocords() when it is called without file names.
		if (transRecords == null || transRecords.isEmpty()) {
			return;
		}
		// Build the header table, which is also the frequent 1-itemset list.
		ArrayList<TreeNode> headerTable = buildHeaderTable(transRecords);
		// Build the FP-tree.
		TreeNode treeRoot = buildFPTree(transRecords, headerTable);
		// Stop when the tree is empty.
		if (treeRoot.getChildren() == null
				|| treeRoot.getChildren().isEmpty()) {
			return;
		}
		// Print every header-table item followed by the suffix pattern.
		if (postPattern != null) {
			for (TreeNode header : headerTable) {
				System.out.print(header.getCount() + "\t" + header.getName());
				for (String ele : postPattern) {
					System.out.print("\t" + ele);
				}
				System.out.println();
			}
		}
		// Recurse on the conditional pattern base of every header-table item.
		for (TreeNode header : headerTable) {
			// The suffix pattern grows by this item.
			List<String> newPostPattern = new LinkedList<String>();
			newPostPattern.add(header.getName());
			if (postPattern != null) {
				newPostPattern.addAll(postPattern);
			}
			// Collect the conditional pattern base by following the chain of
			// tree nodes that carry this item.
			List<List<String>> newTransRecords = new LinkedList<List<String>>();
			for (TreeNode node = header.getNextHomonym(); node != null;
					node = node.getNextHomonym()) {
				// Ancestors of the node, nearest first; the root is
				// recognised by its null name.
				List<String> prefixPath = new ArrayList<String>();
				for (TreeNode anc = node.getParent();
						anc != null && anc.getName() != null;
						anc = anc.getParent()) {
					prefixPath.add(anc.getName());
				}
				// An empty prefix path contributes no items, so it is skipped.
				if (prefixPath.isEmpty()) {
					continue;
				}
				// The path counts once per occurrence of the node.
				for (int i = node.getCount(); i > 0; i--) {
					newTransRecords.add(prefixPath);
				}
			}
			FPGrowth(newTransRecords, newPostPattern);
		}
	}

	/**
	 * Builds the header table (the frequent 1-itemsets): counts every item
	 * occurrence, keeps the items whose count is at least the minimum
	 * support, and sorts them with {@code TreeNode}'s natural ordering.
	 *
	 * @param transRecords transactions to count
	 * @return the sorted header table, or {@code null} when
	 *         {@code transRecords} is {@code null} or empty
	 */
	public ArrayList<TreeNode> buildHeaderTable(List<List<String>> transRecords) {
		if (transRecords == null || transRecords.isEmpty()) {
			return null;
		}
		// Count the occurrences of every item.
		Map<String, TreeNode> counts = new HashMap<String, TreeNode>();
		for (List<String> record : transRecords) {
			for (String item : record) {
				TreeNode node = counts.get(item);
				if (node == null) {
					node = new TreeNode(item);
					node.setCount(1);
					counts.put(item, node);
				} else {
					node.countIncrement(1);
				}
			}
		}
		// Keep the items that reach the minimum support.
		ArrayList<TreeNode> F1 = new ArrayList<TreeNode>();
		for (TreeNode node : counts.values()) {
			if (node.getCount() >= minSuport) {
				F1.add(node);
			}
		}
		Collections.sort(F1);
		return F1;
	}

	/**
	 * Builds the FP-tree. Each transaction is filtered and ordered by
	 * {@link #sortByF1}; the part that matches an existing path from the root
	 * increments the counts along that path, and the remainder is appended
	 * as new nodes.
	 *
	 * @param transRecords transactions to insert
	 * @param F1           header table; may be {@code null} only if every
	 *                     transaction is to be ignored
	 * @return the root of the tree (a node with a {@code null} name)
	 */
	public TreeNode buildFPTree(List<List<String>> transRecords,
			ArrayList<TreeNode> F1) {
		TreeNode root = new TreeNode();
		for (List<String> transRecord : transRecords) {
			LinkedList<String> record = sortByF1(transRecord, F1);
			TreeNode current = root;
			TreeNode next;
			// Walk down the shared prefix, bumping counts on the way.
			while (!record.isEmpty()
					&& (next = current.findChild(record.peek())) != null) {
				next.countIncrement(1);
				current = next;
				record.poll();
			}
			// Whatever is left becomes a new branch below the last match.
			addNodes(current, record, F1);
		}
		return root;
	}

	/**
	 * Returns the items of a transaction that appear in {@code F1}, without
	 * duplicates, in the order in which they appear in {@code F1}. If a name
	 * occurs more than once in {@code F1}, its last position is used.
	 *
	 * @param transRecord transaction to filter and order
	 * @param F1          header table that defines the order
	 * @return a new list; empty when either argument is {@code null}
	 */
	public LinkedList<String> sortByF1(List<String> transRecord,
			ArrayList<TreeNode> F1) {
		LinkedList<String> sorted = new LinkedList<String>();
		if (transRecord == null || F1 == null) {
			return sorted;
		}
		Set<String> pending = new HashSet<String>(transRecord);
		// Scan F1 from the back and prepend: the result ends up in F1 order,
		// and remove() makes sure each item is taken once, at its last
		// position in F1.
		for (int i = F1.size() - 1; i >= 0; i--) {
			String name = F1.get(i).getName();
			if (pending.remove(name)) {
				sorted.addFirst(name);
			}
		}
		return sorted;
	}

	/**
	 * Appends the items of {@code record} as a single chain of new nodes
	 * below {@code ancestor}, each with count 1, and links every new node to
	 * the end of the same-name chain of its header-table entry.
	 * {@code record} is emptied in the process.
	 *
	 * @param ancestor node below which the chain is attached
	 * @param record   remaining items of a transaction, already ordered
	 * @param F1       header table whose same-name chains are extended
	 */
	public void addNodes(TreeNode ancestor, LinkedList<String> record,
			ArrayList<TreeNode> F1) {
		TreeNode parent = ancestor;
		while (!record.isEmpty()) {
			String item = record.poll();
			TreeNode leafnode = new TreeNode(item);
			leafnode.setCount(1);
			leafnode.setParent(parent);
			parent.addChild(leafnode);

			// Append the new node to the tail of its header-table chain.
			for (TreeNode f1 : F1) {
				if (f1.getName().equals(item)) {
					TreeNode tail = f1;
					while (tail.getNextHomonym() != null) {
						tail = tail.getNextHomonym();
					}
					tail.setNextHomonym(leafnode);
					break;
				}
			}

			// The next item hangs below the node just created.
			parent = leafnode;
		}
	}

	/**
	 * Demo entry point: mines the file {@code resource/market} under the
	 * current working directory with a minimum support of 4.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		FPTree fptree = new FPTree();
		fptree.setMinSuport(4);
		// File.separator keeps the path valid on every platform; the
		// hard-coded "\\" only worked on Windows.
		String path = System.getProperty("user.dir") + File.separator
				+ "resource" + File.separator + "market";
		List<List<String>> transRecords = fptree.readTransRocords(path);
		fptree.FPGrowth(transRecords, null);
	}
}

可以看到，main方法读取market文件，并将最小支持度设为4，即只输出共同出现不少于4次的物品组合。

调用FPGrowth方法输出如下：

6	薯片	鸡蛋
5	薯片	面包
5	鸡蛋	面包
4	薯片	鸡蛋	面包
5	薯片	牛奶
5	面包	牛奶
4	鸡蛋	牛奶
4	薯片	面包	牛奶
4	薯片	鸡蛋	牛奶

cq1982

关注

1
点赞
踩
8

收藏

觉得还不错? 一键收藏
4
评论
关联分析——FPTree的一种java实现

关联分析是用来做什么的？这边有一个经典的例子“超市购物单”，文件market内容如下：牛奶，鸡蛋，面包，薯片鸡蛋，爆米花，薯片，啤酒鸡蛋，面包，薯片牛奶，鸡蛋，面包，爆米花，薯片，啤酒牛奶，面包，啤酒鸡蛋，面包，啤酒牛奶，面包，薯片牛奶，鸡蛋，面包，黄油，薯片牛奶，鸡蛋，黄油，薯片每一行可以看作一个购物单，关联分析就是用来分析哪些物品经常会被同时购买（也就是关联度较大）。
复制链接

扫一扫