直接说这算法的作用吧,
上图的右边就是左边图的 ES(Embedded Subtree),相对应的ABC-1D-1-1E-1-1就是上图左边的Pre-Order-String,也是这个算法输入的数据格式,中间我们用空格隔开(A B C -1 D -1 -1 E -1 -1),-1就是表示回走。
这算法主要是用来挖掘频繁子树,也就是将一个树里的所有频繁子树找出来。原理就不说了,大家可以自己去搜下这篇论文,这是邹老师的一篇论文。这里我主要采用java来实现,所先针对GrowthElement、PrefixTree、PrefixTreeNode、ProjectedDB建立实体类,然后再递归实现搜索ES。
package iie.ucas.treeMiner.bean;
public class GrowthElement {
private String label = "";
private int attached = 0;// attached means that the GEs is attached to the
private int times = 0;// 出现的次数,跟min_sup比较
// n-th node of the prefix-tree.
public String getLabel() {
return label;
}
public int getTimes() {
return times;
}
public void setTimes(int times) {
this.times = times;
}
public void setLabel(String label) {
this.label = label;
}
public int getAttached() {
return attached;
}
public void setAttached(int attached) {
this.attached = attached;
}
public String toString(){
return "(" + label + ","+ getAttached() + "):" + getTimes();
}
}
<pre name="code" class="java">package iie.ucas.treeMiner.bean;
public class PrefixTree {
private ProjectedDB prodb;
private PrefixTreeNode[] ptnodes;
private int length = 0;
private int times = 1;
public PrefixTree(int n) {
ptnodes = new PrefixTreeNode[n];// 频繁模式的prefix的label个数<A C -1 -1>,则n为2
for (int i = 0; i < n; i++) {
ptnodes[i] = new PrefixTreeNode();
}
length = n;
}
public PrefixTreeNode getPtnode(int index) {
if (index < length)
return ptnodes[index];
else
return null;
}
public int getTimes() {
return times;
}
public void setTimes(int times) {
this.times = times;
}
public void setPtnode(int index, int numMinus, String label, int pos) {
if (index < length) {
ptnodes[index].setNumMinus(numMinus);
ptnodes[index].setLabel(label);
ptnodes[index].setPos(pos);
}
}
public String toString() {
String res = "";
for (int i = 0; i < length; i++) {
if (i == 0)
res += ptnodes[i].getLabel();
else {
res += " " + ptnodes[i].getLabel();
}
}
return res;
}
public String toLastLabelString() {
String res = "";
if(length == 1)
return ptnodes[0].getLabel();
for (int i = 0; i < length; i++) {
if (i == 0) {
res += ptnodes[i].getLabel();
for (int j = 0; j < ptnodes[i].getNumMinus(); j++) {
res += " " + "-1";
}
} else if (i == length - 1) {
res += " " + ptnodes[i].getLabel();
} else {
res += " " + ptnodes[i].getLabel();
for (int j = 0; j < ptnodes[i].getNumMinus(); j++) {
res += " " + "-1";
}
}
}
return res;
}
public String toString(boolean isComplete) {
String res = "";
for (int i = 0; i < length; i++) {
if (i == 0) {
res += ptnodes[i].getLabel();
for (int j = 0; j < ptnodes[i].getNumMinus(); j++) {
res += " " + "-1";
}
} else {
res += " " + ptnodes[i].getLabel();
for (int j = 0; j < ptnodes[i].getNumMinus(); j++) {
res += " " + "-1";
}
}
}
return res;
}
public int getLength() {
return length;
}
public void setLength(int length) {
this.length = length;
}
public ProjectedDB getProdb() {
return prodb;
}
public void setProdb(ProjectedDB prodb) {
this.prodb = prodb;
}
public PrefixTreeNode[] getPtnode() {
return ptnodes;
}
public void setPtnode(PrefixTreeNode[] ptnodes) {
this.ptnodes = ptnodes;
}
}
package iie.ucas.treeMiner.bean;
public class PrefixTreeNode {
private String label = "";
private int pos = 0;
private int numMinus = 0;// 用来表示在下一个label前着-1的个数
public int getNumMinus() {
return numMinus;
}
public void setNumMinus(int numMinus) {
this.numMinus = numMinus;
}
public String getLabel() {
return label;
}
public void setLabel(String label) {
this.label = label;
}
public int getPos() {
return pos;
}
public void setPos(int pos) {
this.pos = pos;
}
}
package iie.ucas.treeMiner.bean;
import java.util.ArrayList;
public class ProjectedDB {
private ArrayList<String> pre_order_list;// 用来存放ProDB,eg.{A B C -1 -1- 1;A B
// B -1 D -1 -1 -1}
public ProjectedDB() {
pre_order_list = new ArrayList<String>();
}
public ArrayList<String> getPre_order_list() {
return pre_order_list;
}
public void setPre_order_list(ArrayList<String> preOrderList) {
pre_order_list = preOrderList;
}
}
package iie.ucas.treeMiner.action;
import iie.ucas.treeMiner.bean.GrowthElement;
import iie.ucas.treeMiner.bean.PrefixTree;
import iie.ucas.treeMiner.bean.ProjectedDB;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map.Entry;
public class PrefixTreeESpanAction {
/**
* 按行读取数据,并存放到ArrayList中
*
* @param path
* @return
* @throws IOException
* @throws IOException
*/
public ArrayList<String> file2arrlist(String path) throws IOException,
IOException {
ArrayList<String> arrlist = new ArrayList<String>();
File datafile = new File(path);
String line = "";
BufferedReader input = new BufferedReader(new InputStreamReader(
new FileInputStream(datafile), "utf-8"));
while ((line = input.readLine()) != null) {
arrlist.add(line);
}
input.close();
return arrlist;
}
/**
* 统计arrlist中每个label出现的次数
*
* @param arrlist
* @return
*/
public HashMap<String, Integer> getAllLabel(ArrayList<String> arrlist) {
HashMap<String, Integer> hashmap = new HashMap<String, Integer>();
for (int i = 0; i < arrlist.size(); i++) {
String[] labels = arrlist.get(i).split(" ");// 返回第i行的所有label
for (int j = 0; j < labels.length; j++) {
if (!labels[j].equals("-1")) {// 筛选所有label
if (hashmap.containsKey(labels[j])) {// label已经存在,直接在相应的value上加1
hashmap.put(labels[j], hashmap.get(labels[j]) + 1);
} else {
hashmap.put(labels[j], 1);
}
}
}
}
return hashmap;
}
/**
* 根据prefixtree构造相应的prodb
*
* @param prodb
* @param pretree
* @return
*/
public ProjectedDB getProjectedDB(ProjectedDB prodb, PrefixTree pretree) {// suppose
// pretree:<A(1)
// -1>
ProjectedDB resprodb = new ProjectedDB();
ArrayList<String> pretlist = new ArrayList<String>();
String label = pretree.getPtnode()[pretree.getLength() - 1].getLabel();
System.out.println("<" + pretree.toString(true) + ">-projectedDB");
ArrayList<String> prodblist = prodb.getPre_order_list();
for (int i = 0; i < prodblist.size(); i++) {
String[] labels = prodblist.get(i).split(" ");
int length = labels.length;
for (int k = pretree.getLength()-1; k < length; k++) {
String prodbElement = label;//pretree.toLastLabelString();///??????????????????????????????????????
//System.out.println("lastlabelString :" + prodbElement);
// String prodbElement = label;// 表示一行such as.{A C E -1 -1 -1}
// System.out.println("prodbElement:" + prodbElement);
if (label.equals(labels[k])) {// find all occurrence of label
// ,and
// construct prodb
int contLabel = 1;//pretree.getLength();//????????
int minus = 0;// 用来标记后面出现-1的个数
for (int j = k+1; j < length; j++) {//从根label一样的那个位置的下一个位置开始
if (!labels[j].equals("-1")) {// not -1
contLabel++;
prodbElement += " " + labels[j];
} else {
minus++;
prodbElement += " " + labels[j];
if (minus >= contLabel) {//
//k = j + 1;
break;
}
}
}// break
System.out.println("prodbElement:" + prodbElement);
pretlist.add(prodbElement);// prodb的一项内容,也就是一行
}// end{if}
}
}
resprodb.setPre_order_list(pretlist);
return resprodb;
}
/**
* 返回pretree对应的 GrowthElements
*
* @param prodb
* @param min_sup
* @param pretree
* @return
*/
public HashSet<GrowthElement> getGEs(ProjectedDB prodb, PrefixTree pretree) {
// ProjectedDB pdb = this.getProjectedDB(prodb, pretree);
ArrayList<String> arrprodb = prodb.getPre_order_list();// pdb.getPre_order_list();
HashSet<GrowthElement> geset = new HashSet<GrowthElement>();
int prelength = pretree.getLength();
for (int i = 0; i < arrprodb.size(); i++) {
String labels[] = arrprodb.get(i).split(" ");
for (int j = prelength; j < labels.length; j++) {
// if (prelength == 1) {
//
// } else {// pretree.getLength()!=1
int contLabel = 0;
int minus = 0;
if (!labels[j].equals("-1")) {
if (contLabel >= minus) {// GE attached to
// pretree.getLength()
GrowthElement tmpge = null;
boolean flag = false;// 用来表示新的GE是否之前已经找到
Iterator<GrowthElement> it = geset.iterator();
while (it.hasNext()) {
tmpge = it.next();
if (tmpge.getLabel().equals(labels[j])
&& tmpge.getAttached() == prelength) {
flag = true;
break;
}
}// break
if (flag) {// 该GE之前就找到过了
tmpge.setTimes(tmpge.getTimes() + 1);
geset.add(tmpge);
} else {
GrowthElement ge = new GrowthElement();
ge.setAttached(prelength);
ge.setLabel(labels[j]);
ge.setTimes(1);
geset.add(ge);
}
} else {// //
GrowthElement tmpge = null;
boolean flag = false;// 用来表示新的GE是否之前已经找到
Iterator<GrowthElement> it = geset.iterator();
while (it.hasNext()) {
tmpge = it.next();
if (tmpge.getLabel().equals(labels[j])
&& tmpge.getAttached() == prelength) {
flag = true;
break;
}
}// break
if (flag) {// 该GE之前就找到过了
tmpge.setTimes(tmpge.getTimes() + 1);
geset.add(tmpge);
} else {
GrowthElement ge = new GrowthElement();
ge.setAttached(prelength + (contLabel - minus));
ge.setLabel(labels[j]);
ge.setTimes(1);
geset.add(ge);
}
}
contLabel++;
} else {// labels[j].equals("-1")
minus++;
}
}
}
return geset;
}
public PrefixTree extendSByGE(GrowthElement ge, PrefixTree pretree,
int min_sup) {
PrefixTree respret = null;
int length = pretree.getLength();
if (ge.getTimes() >= min_sup) {// 判断是否符合大于min_sup
if (1 == length) {
respret = new PrefixTree(length + 1);
respret.setPtnode(0, 0, pretree.getPtnode(0).getLabel(), 1);
respret.setPtnode(1, 2, ge.getLabel(), 2);
} else {
respret = new PrefixTree(length + 1);
int attached = ge.getAttached();
if (attached == length) {
for (int i = 0; i < length; i++) {// 将之前的信息复制过来eg.{A B -1
// -1}GE是(C,2)则{A B C -1
// -1 -1}
respret.setPtnode(i, 0,
pretree.getPtnode(i).getLabel(), i + 1);
}
respret.setPtnode(length, length + 1, ge.getLabel(),
length + 1);// 设置新加入的GE的信息
} else {// ***************************************************************************88//
for (int i = 0; i < length - 1; i++) {// 将之前的信息复制过来eg.{A B
// -1 -1}GE是(C,2)则{A
// B C -1 -1 -1}
respret.setPtnode(i,
pretree.getPtnode(i).getNumMinus(), pretree
.getPtnode(i).getLabel(), pretree
.getPtnode(i).getPos());
}
// int index,int numMinus, String label,int pos
respret.setPtnode(length - 1, (length - attached), pretree
.getPtnode(length - 1).getLabel(), pretree
.getPtnode(length - 1).getPos());
respret.setPtnode(length, attached + 1, ge.getLabel(),
length + 1);// 设置新加入的GE的信息
}
}
} else {// end{if}判断是否符合大于min_sup
return null;
}
System.err.println("添加GE后的prefix:" + respret.toString(true));
return respret;
}
public void Fre(PrefixTree pretree, ProjectedDB prodb, int min_sup) {
HashSet<GrowthElement> geSet = this.getGEs(prodb, pretree);
Iterator<GrowthElement> it = geSet.iterator();
while (it.hasNext()) {
GrowthElement ge = it.next();
System.out.println("ge: " + ge.toString());
Fre(this.extendSByGE(ge, pretree, min_sup), this.getProjectedDB(
prodb, this.extendSByGE(ge, pretree, min_sup)), min_sup);
}
return ;
}
public static void main(String[] args) throws IOException {
// PrefixTreeESpanAction ptes = new PrefixTreeESpanAction();
// ArrayList<String> arrlist = ptes
// .file2arrlist("C:\\Users\\Fernando\\Desktop\\treedata\\CSlog.data");// 原始数据
// HashMap<String, Integer> all_label = ptes.getAllLabel(arrlist);
// System.out.println("所有label数量: " + all_label.size());
// ProjectedDB basedb = new ProjectedDB();
// basedb.setPre_order_list(arrlist);// 最先的原始proDB
// PrefixTree pretrees[] = new PrefixTree[all_label.size()];// {a -1},{b
// // -1}....
// for (int i = 0; i < pretrees.length; i++) {
// pretrees[i] = new PrefixTree(1);// 生成长度为1的prefixtree
// }
// Iterator iter = all_label.entrySet().iterator();
/ int index = 0;
/// while (iter.hasNext()) {
// Entry entry = (Entry) iter.next();
// pretrees[index].setLength(1);
// pretrees[index].getPtnode(0).setLabel((String) entry.getKey());
// pretrees[index].getPtnode(0).setPos(1);
// pretrees[index].getPtnode(0).setNumMinus(1);
// index++;
/// }
// int min_sup = 1;
// for (int i = 0; i < pretrees.length; i++) {
// ProjectedDB ptdb = ptes.getProjectedDB(basedb, pretrees[i]);
// ptes.Fre(pretrees[i],ptdb, min_sup);
// HashSet<GrowthElement> geSet = ptes.getGEs(ptdb, pretrees[i]);
// Iterator<GrowthElement> it = geSet.iterator();
// while (it.hasNext()) {
// GrowthElement ge = it.next();
// System.out.println("ge: " + ge.toString());
// PrefixTree induce_pre = ptes.extendSByGE(ge, pretrees[i],
// min_sup);
// ProjectedDB induceDB = ptes.getProjectedDB(ptdb, induce_pre) ;
// HashSet<GrowthElement> geSet1 = ptes.getGEs(induceDB,
// induce_pre);
// Iterator<GrowthElement> it1 = geSet1.iterator();
// while (it1.hasNext()) {
// GrowthElement ge1 = it1.next();
// // System.out.println("ge1: " + ge.toString());
// PrefixTree induce_pre1 = ptes.extendSByGE(ge1, induce_pre,
// min_sup);
// }
// System.out.println("======================");
// }
}
// ProjectedDB ptdb = new ProjectedDB();
// for (int i = 0; i < pretrees.length; i++) {
// //{A -1}-proDB
// ptdb = ptes.getProjectedDB(basedb, pretrees[i]);// /记录下来下次用得到
// HashSet<GrowthElement> geSet = ptes.getGEs(ptdb, pretrees[i]);
// Iterator<GrowthElement> it = geSet.iterator();
// while (it.hasNext()) {
// GrowthElement ge = it.next();
// System.out.println("ge: " + ge.toString());
// PrefixTree induce_pre = ptes.extendSByGE(ge, pretrees[i], min_sup);
// ProjectedDB induceDB = ptes.getProjectedDB(ptdb, induce_pre) ;
// HashSet<GrowthElement> inducegeSet = ptes.getGEs(induceDB,
// pretrees[i]);
// }
//
// }
// ===================================================================================================
// PrefixTree pretree = new PrefixTree(1);
// pretree.getPtnode()[0].setLabel("72");
// pretree.getPtnode()[0].setPos(1);
// // pretree.getPtnode()[0].setContainMinus(true);//<72(1)
// // -1>,也就是第一个位置的标签是72
// // ***********//
// // ProjectedDB prodb1 = ptes.getProjectedDB(prodb, pretree);
// // ***********//
// HashSet<GrowthElement> hashset = ptes.getGEs(prodb, pretree);
// Iterator<GrowthElement> it = hashset.iterator();
// System.out.println("GEs:");
// while (it.hasNext()) {
// GrowthElement ge = it.next();
// System.out.println("ge: " + ge.toString());
// ptes.extendSByGE(ge, pretree, 2);
// }
// ArrayList<String> treedata = new ArrayList<String>();
// File datafile = new File(
// "C:\\Users\\Fernando\\Desktop\\treedata\\test.txt");
// BufferedReader input = new BufferedReader(new InputStreamReader(
// new FileInputStream(datafile), "utf-8"));
//
// String line = "";
//
// while ((line = input.readLine()) != null) {
// treedata.add(line);
// }
// // treedata = buffer.toString();
// input.close();
//
// System.out.println("lines:" + treedata.size());
//
// String data[] = treedata.get(0).split(" ");
// System.out.println(data[1].equals("-1"));
// System.out.println(data[0] + ":" + data[1]);
// HashSet<GrowthElement> geset = new HashSet<GrowthElement>();
// GrowthElement ge = new GrowthElement();
// geset.add(ge);
// ge.setAttached(11);
// geset.add(ge);
// System.out.println(geset.size());
}
}