频繁项集挖掘算法Apriori + FpGrowth(C++代码)
Apriori算法代码
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <map>
#include <algorithm>
#include <ctime>
#include <cstring>
using namespace std;
// Minimum support as a fraction of the transaction count; read from stdin in main().
double minsup;
// Absolute minimum support count; readEvent() sets it to ceil(eventCount * minsup).
int minSupport;
// Every transaction read from the input file; readEvent() sorts each one ascending.
vector<vector<int> > events;
// Capacity of the per-transaction flag arrays below.
// NOTE(review): transactions are used directly as indexes into book/tbook, so the
// input is presumably expected to hold at most maxn transactions — confirm.
const int maxn = 88888;
// book[i]: transaction i is still scanned by itemCount().
// tbook[i]: scratch flag set by itemCount(); Link() clears it first and copies it into book afterwards.
bool book[maxn], tbook[maxn];
// Per-item counter keyed by item id, accumulated by readEvent().
map<int, int> M;
// Running total of itemsets written so far by showItemSet().
int freqItemsCount;
// Output file that showItemSet() writes the frequent itemsets to.
ofstream fout("AprioriFreqItemSet.txt");
void readEvent() {
ifstream fin("retail.dat");
int eventCount = 0;
string line;
while(getline(fin, line)) {
vector<int> event;
int num = 0;
line += ' ';
for(int i = 0; line[i] != '\0'; i++) {
if(isdigit(line[i])) { num *= 10; num += line[i] - '0'; }
else {
if(isdigit(line[i - 1])) {
event.push_back(num);
M[num]++;
}
num = 0;
}
}
sort(event.begin(), event.end());
events.push_back(event);
eventCount++;
}
minSupport = ceil(eventCount * minsup);
fin.close();
}
void generateOneItemSet(vector<pair<vector<int>, int> >& Ck) {
int eventsSize = events.size();
for(int i = 0; i < eventsSize; i++)
for(auto item : events[i]) {
if(M[item] >= minSupport) {
book[i] = true;
break;
}
}
for(auto it : M)
if(it.second >= minSupport)
Ck.push_back(make_pair(vector<int> (1, it.first), it.second));
}
// Returns true when v1 and v2 agree on everything except their last element,
// i.e. on the first v1.size() - 1 positions. v2 is expected to be at least
// that long. Itemsets with fewer than two elements always match.
bool check(vector<int>& v1, vector<int>& v2) {
    const int prefixLen = static_cast<int>(v1.size()) - 1;
    if(prefixLen <= 0) return true;
    return equal(v1.begin(), v1.begin() + prefixLen, v2.begin());
}
int itemCount(vector<int>& item) {
int cnt = 0;
int len1 = item.size();
int eventsSize = events.size();
for(int i = 0; i < eventsSize; i++) {
if(!book[i]) continue;
int len2 = events[i].size();
if(len2 < len1) continue;
int p1 = 0, p2 = 0;
while(p1 < len1 && p2 < len2) {
if(item[p1] == events[i][p2]) p1++;
p2++;
}
if(p1 == len1) {
cnt++;
tbook[i] = true;
}
}
return cnt;
}
void Link(vector<pair<vector<int>, int> >& Ck) {
memset(tbook, false, sizeof tbook);
vector<pair<vector<int>, int> > itemSet;
int len = Ck.size();
for(int i = 0; i < len; i++) {
for(int j = i + 1; j < len; j++) {
vector<int> tmp;
if(check(Ck[i].first, Ck[j].first)) {
tmp = Ck[i].first;
tmp.push_back(Ck[j].first.back());
if(tmp[tmp.size() - 1] < tmp[tmp.size() - 2])
swap(tmp[tmp.size() - 1], tmp[tmp.size() - 2]);
if(itemCount(tmp) >= minSupport) {
itemSet.push_back(make_pair(tmp, itemCount(tmp)));
}
}
}
}
memcpy(book, tbook, sizeof book);
Ck = itemSet;
}
// Appends the itemsets in Ck to the output file, one "{ a b c }: count" line
// per itemset, preceded by a header naming the itemset size when Ck is not
// empty. The number of itemsets is added to freqItemsCount.
void showItemSet(vector<pair<vector<int> ,int> >& Ck) {
    if(!Ck.empty()) fout << "----" << Ck.front().first.size() << "项集----\n";
    for(size_t row = 0; row < Ck.size(); ++row) {
        const vector<int>& items = Ck[row].first;
        fout << "{ ";
        for(size_t k = 0; k < items.size(); ++k) fout << items[k] << " ";
        fout << "}: " << Ck[row].second << endl;
    }
    freqItemsCount += Ck.size();
}
// Entry point of the Apriori program: reads the fractional minimum support
// from stdin, mines the frequent itemsets level by level and reports the
// total count and the elapsed CPU time.
int main(void)
{
    cout << "请输入最小支持度:";
    // A failed read leaves the threshold at 0, and a non-positive threshold
    // makes every candidate "frequent"; reject both, as well as fractions above 1.
    if(!(cin >> minsup) || minsup <= 0 || minsup > 1) {
        cerr << "最小支持度必须是(0, 1]范围内的小数" << endl;
        return 1;
    }
    const clock_t startTime = clock();
    vector<pair<vector<int>, int> > Ck;
    readEvent();
    generateOneItemSet(Ck);
    showItemSet(Ck);
    // At least two k-itemsets are needed to join them into a (k+1)-itemset.
    while(Ck.size() > 1) {
        Link(Ck);
        showItemSet(Ck);
    }
    cout << "频繁项集个数:" << freqItemsCount << endl;
    // Report the file name that fout was actually opened with.
    cout << "频繁项集信息已保存到文件AprioriFreqItemSet.txt" << endl;
    cout << "程序总用时:" << double(clock() - startTime) / CLOCKS_PER_SEC << endl;
    return 0;
}
FpGrowth算法代码
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <fstream>
#include <ctime>
#include <cstring>
#include <set>
using namespace std;
// A list of item ids: one transaction, one itemset or one prefix path.
typedef vector<int> V;
// A list of (items, count) pairs: the transactions fed into createFpTree()
// together with the count each one contributes.
typedef vector<pair<vector<int>, int> > VPV;
typedef vector<int>::iterator Vit;
// minSup: absolute support threshold, computed in readEvent() as ceil(eventCount * mins).
// mins: fractional threshold read from stdin in main().
int minSup; double mins;
// Size of the item-indexed arrays below; item ids are used directly as indexes.
// NOTE(review): item ids are presumably expected to be smaller than maxn — confirm against the data set.
const int maxn = 16500;
// M[item]: count of the item. Filled by readEvent() for the whole database and
// rebuilt by fpGrowth() for every conditional pattern base.
int M[maxn];
// pos[item]: index of the item in the header table currently being built.
int pos[maxn];
// Every frequent itemset found so far; fpGrowth() inserts each one sorted ascending.
set<V> freqItemList;
// One node of an FP-tree.
struct fpNode {
    int id;                    // item id; -1 marks a node that carries no item (the root)
    vector<fpNode*> children;  // child nodes, in insertion order
    fpNode* parent;            // node one step closer to the root, or nullptr
    fpNode* next;              // next node carrying the same item id, or nullptr
    int count;                 // accumulated count of the paths running through this node

    // A fresh node carries no item, has no links and a zero count.
    fpNode() : id(-1), parent(nullptr), next(nullptr), count(0) {}
};
// Header table: one entry per item, pointing at the first node of that item's
// `next` chain.
typedef vector<fpNode*> headTab;
// Ordering predicate: item a goes before item b when its count in M is
// strictly larger (descending count order).
bool cmp(int a, int b) {
    const int countA = M[a];
    const int countB = M[b];
    return countB < countA;
}
// Loads the transaction database from "retail.dat".
//
// events  - receives one (sorted, duplicate-free item list, 1) pair per input line
// itemSet - receives the frequent items ordered by descending count
// head    - resized to one (null) entry per frequent item
//
// Also fills the globals: M[item] is the number of transactions containing the
// item, minSup is the absolute threshold derived from `mins`, and pos[item] is
// the index of each frequent item inside itemSet / head.
//
// Item ids outside [0, maxn) cannot be used as indexes into M and pos; they
// are dropped and reported on stderr.
void readEvent(VPV& events, V& itemSet, headTab& head) {
    ifstream fin("retail.dat");
    if(!fin) {
        // The read loop below is simply skipped; make the reason visible.
        cerr << "无法打开文件retail.dat" << endl;
    }
    int eventCount = 0;
    int skipped = 0;  // occurrences dropped because the id does not fit into M/pos
    string line;
    while(getline(fin, line)) {
        V parsed;
        int num = 0;
        bool inNumber = false;  // true while the characters of a number are being consumed
        for(char c : line) {
            if(c >= '0' && c <= '9') {
                num = num * 10 + (c - '0');
                inNumber = true;
            } else {
                // A separator ends the current number (if any). Tracking this
                // with a flag avoids peeking at line[i - 1], which is out of
                // bounds for the first character of the line.
                if(inNumber) parsed.push_back(num);
                num = 0;
                inNumber = false;
            }
        }
        if(inNumber) parsed.push_back(num);  // number that runs up to the end of the line

        sort(parsed.begin(), parsed.end());
        // A repeated item must count once per transaction and must not show
        // up twice on the same tree path.
        parsed.erase(unique(parsed.begin(), parsed.end()), parsed.end());

        V event;
        for(int item : parsed) {
            if(item < 0 || item >= maxn) {
                ++skipped;
                continue;
            }
            M[item]++;
            itemSet.push_back(item);
            event.push_back(item);
        }
        events.push_back(make_pair(event, 1));
        eventCount++;
    }
    if(skipped > 0) {
        cerr << "警告:已忽略" << skipped << "个超出范围[0, " << maxn << ")的项" << endl;
    }
    minSup = static_cast<int>(ceil(eventCount * mins));

    // Distinct items, ordered by descending count; ties keep ascending id order.
    sort(itemSet.begin(), itemSet.end());
    itemSet.erase(unique(itemSet.begin(), itemSet.end()), itemSet.end());
    stable_sort(itemSet.begin(), itemSet.end(), cmp);
    // Counts are descending, so everything from the first infrequent item on is infrequent.
    itemSet.erase(find_if(itemSet.begin(), itemSet.end(),
                          [](int item) { return M[item] < minSup; }),
                  itemSet.end());

    const int itemCount = itemSet.size();
    for(int i = 0; i < itemCount; i++) pos[itemSet[i]] = i;
    head.resize(itemCount);
    // fin is closed by its destructor.
}
// Inserts the item sequence [it, end) below inTree, adding `count` to it.
//
// As long as the current node already has a child carrying the next item,
// that child's count is increased and the walk continues from it. From the
// first item without a matching child onwards, a chain of new nodes is
// created; each new node is also pushed onto the front of its item's chain in
// the header table.
void updateFpTree(fpNode* inTree, headTab& head, Vit it, Vit end, int count) {
    fpNode* cursor = inTree;

    // Phase 1: follow the part of the path that already exists.
    for(; it != end; ++it) {
        fpNode* match = nullptr;
        const size_t fanout = cursor->children.size();
        for(size_t k = 0; k < fanout && match == nullptr; ++k) {
            if(cursor->children[k]->id == *it) match = cursor->children[k];
        }
        if(match == nullptr) break;
        match->count += count;
        cursor = match;
    }

    // Phase 2: grow a new branch for whatever is left.
    for(; it != end; ++it) {
        fpNode* fresh = new fpNode();
        fresh->id = *it;
        fresh->parent = cursor;
        fresh->count = count;
        const int slot = pos[*it];
        fresh->next = head[slot];
        head[slot] = fresh;
        cursor->children.push_back(fresh);
        cursor = fresh;
    }
}
// Builds an FP-tree below fpTree from `events`.
//
// Each transaction is copied, reordered by descending count (stable, so ties
// keep their existing order), cut off at the first item whose count is below
// minSup, and then inserted with its own count. `events` itself is not
// modified.
void createFpTree(fpNode* fpTree, VPV& events, headTab& head) {
    for(size_t row = 0; row < events.size(); ++row) {
        V items = events[row].first;  // working copy; the stored transaction stays untouched
        const int weight = events[row].second;

        stable_sort(items.begin(), items.end(), cmp);

        Vit firstRare = items.begin();
        while(firstRare != items.end() && M[*firstRare] >= minSup) ++firstRare;
        items.erase(firstRare, items.end());

        updateFpTree(fpTree, head, items.begin(), items.end(), weight);
    }
}
// Debug helper: prints the id of every node of the tree rooted at inTree, one
// per line, in pre-order (a node first, then its children from first to last).
void showFpTree(fpNode* inTree) {
    vector<fpNode*> pending(1, inTree);
    while(!pending.empty()) {
        fpNode* node = pending.back();
        pending.pop_back();
        cout << node->id << endl;
        // Push the children back to front so that the first child is visited next.
        for(size_t k = node->children.size(); k > 0; --k) {
            pending.push_back(node->children[k - 1]);
        }
    }
}
// Returns S extended by the item ids found on the way from fp up to, but not
// including, the first node whose id is -1 (the root). The ids are appended
// in walking order, i.e. from fp towards the root.
vector<int> getItemSet(fpNode* fp, V S) {
    for(fpNode* node = fp; node->id != -1; node = node->parent) {
        S.push_back(node->id);
    }
    return S;
}
void fpGrowth(headTab head, V path) {
for(int i = head.size() - 1; i >= 0; i--) {
fpNode* fp = head[i];
VPV condEvents;
V condItemSet;
headTab condHead;
memset(M, 0, sizeof M);
memset(pos, -1, sizeof pos);
while(fp != nullptr) {
V tmp;
V condEvent = getItemSet(fp->parent, tmp);
sort(condEvent.begin(), condEvent.end());
condEvents.push_back(make_pair(condEvent, fp->count));
for(auto item : condEvent) {
M[item] += fp->count;
condItemSet.push_back(item);
}
fp = fp->next;
}
sort(condItemSet.begin(), condItemSet.end());
condItemSet.erase(unique(condItemSet.begin(), condItemSet.end()), condItemSet.end());
sort(condItemSet.begin(), condItemSet.end(), cmp);
for(Vit it = condItemSet.begin(); it != condItemSet.end(); it++) {
if(M[*it] < minSup) {
condItemSet.erase(it, condItemSet.end());
break;
}
}
int itemCount = condItemSet.size();
for(int i = 0; i < itemCount; i++) pos[condItemSet[i]] = i;
condHead.resize(itemCount);
fpNode* condFpTree = new fpNode();
createFpTree(condFpTree, condEvents, condHead);
V newPath = path;
newPath.push_back(head[i]->id);
sort(newPath.begin(), newPath.end());
freqItemList.insert(newPath);
for(auto item : condItemSet) {
V tmp = newPath;
tmp.push_back(item);
sort(tmp.begin(), tmp.end());
freqItemList.insert(tmp);
}
if(condHead.size()) {
fpGrowth(condHead, newPath);
}
}
}
void showFreqItemSet() {
cout << "频繁项个数:" << freqItemList.size() << endl;
ofstream fout("FpGrowthFreqItemSet.txt");
for(auto freqItems : freqItemList) {
fout << "{ ";
for(auto item : freqItems) fout << item << " ";
fout << "}" << endl;
}
cout << "频繁项集信息已保存到文件FpGrowthFreqItemSet.txt" << endl;
fout.close();
}
// Entry point of the FP-Growth program: reads the fractional minimum support
// from stdin, builds the FP-tree, mines it and writes the result file,
// reporting the elapsed CPU time after the build and after the mining.
int main(void)
{
    cout << "请输入最小支持度:";
    // A failed read leaves the threshold at 0, and a non-positive threshold
    // makes every itemset "frequent", so the mining would enumerate all
    // subsets; reject both, as well as fractions above 1.
    if(!(cin >> mins) || mins <= 0 || mins > 1) {
        cerr << "最小支持度必须是(0, 1]范围内的小数" << endl;
        return 1;
    }
    const clock_t startTime = clock();
    VPV events;
    V itemSet;
    headTab head;
    fpNode* fpTree = new fpNode();  // root of the global tree; lives until the program exits
    readEvent(events, itemSet, head);
    createFpTree(fpTree, events, head);
    cout << "建树用时:" << double(clock() - startTime) / CLOCKS_PER_SEC << endl;
    V path;  // empty suffix: the global tree is not conditioned on anything
    fpGrowth(head, path);
    cout << "程序总用时:" << double(clock() - startTime) / CLOCKS_PER_SEC << endl;
    showFreqItemSet();
    return 0;
}