电影数据集关联分析及FP-Growth实现

(1)数据预处理

我们先对数据集进行观察,其属性为 'movieId'、'title'、'genres':其中 'movieId' 为电影的序号(但并不连续完整),'title' 为电影名称及年份,'genres' 为电影的分类标签。因此电影的分类标签可以作为我们研究此数据集关联分析的文本数据。

我们可以看到电影的分类标签在同一个电影下不只有一个,且用’|’分开,因此我们对数据进行以下处理:

import pandas as pd
import csv

# Read movies.csv, drop the header row, and split the pipe-delimited
# genre field (third column) of every record into a list of tags.
# The resulting list of transactions feeds the association analysis below.
with open("movies.csv", mode="r", encoding='gb18030', errors='ignore') as file:
    rows = csv.reader(file)
    next(rows)  # discard the header line
    li = [record[2].split("|") for record in rows]

导入必要库,读取csv第三列去表头的文件数据,并且进行文本分割,将分割完的数据存储进列表里,作为后面算法进行关联分析的数据集。下图是处理完的数据集部分数据:

(2)代码

import csv
from itertools import combinations


def load_transactions(path="movies.csv"):
    """Load genre transactions from the MovieLens movies file.

    Each row's third column holds pipe-separated genre tags; every row
    becomes one transaction (a list of genre strings). The header row
    is skipped.
    """
    transactions = []
    with open(path, mode="r", encoding="gb18030", errors="ignore") as file:
        reader = csv.reader(file)
        next(reader)  # skip the header row
        for row in reader:
            transactions.append(row[2].split("|"))
    return transactions


def apriori(transactions, min_support):
    """Mine frequent itemsets with the Apriori algorithm.

    Returns a dict mapping each frequent itemset (a tuple of items in
    sorted order) to its support, i.e. the fraction of transactions
    containing it. Keys are kept in sorted order so that any subset of a
    frequent itemset can be looked up reliably during rule generation.
    """
    total = len(transactions)
    frequent_itemsets = {}
    if total == 0:  # guard: empty dataset would divide by zero below
        return frequent_itemsets

    # Count 1-item supports.
    item_counts = {}
    for transaction in transactions:
        for item in transaction:
            item_counts[item] = item_counts.get(item, 0) + 1
    for item, count in item_counts.items():
        if count / total >= min_support:
            frequent_itemsets[(item,)] = count / total

    # Grow candidate size k until no new frequent itemsets appear.
    k = 2
    while True:
        # Candidate pool: every item occurring in some frequent itemset.
        # Sorting fixes tuple order, so a subset of a frequent itemset is
        # always stored under the same key (the original version iterated
        # an unordered set here, which could make rule generation raise
        # KeyError on a differently-ordered subset tuple).
        pool = sorted({item for itemset in frequent_itemsets for item in itemset})
        candidates = list(combinations(pool, k))

        candidate_counts = {}
        for transaction in transactions:
            transaction_set = set(transaction)  # hoisted: build once per transaction
            for candidate in candidates:
                if transaction_set.issuperset(candidate):
                    candidate_counts[candidate] = candidate_counts.get(candidate, 0) + 1

        new_frequent = {
            candidate: count / total
            for candidate, count in candidate_counts.items()
            if count / total >= min_support
        }
        if not new_frequent:  # no frequent k-itemsets: mining is done
            break
        frequent_itemsets.update(new_frequent)
        k += 1
    return frequent_itemsets


def generate_rules(frequent_itemsets, min_confidence):
    """Derive association rules X => Y from the frequent itemsets.

    Returns a list of (antecedent, consequent, support, confidence)
    tuples for every rule whose confidence reaches min_confidence.
    """
    rules = []
    for itemset, support in frequent_itemsets.items():
        if len(itemset) < 2:
            continue
        for size in range(1, len(itemset)):
            for antecedent in combinations(itemset, size):
                consequent = tuple(sorted(set(itemset) - set(antecedent)))
                # combinations() preserves the (sorted) order of itemset,
                # so antecedent matches its stored key exactly.
                confidence = support / frequent_itemsets[antecedent]
                if confidence >= min_confidence:
                    rules.append((antecedent, consequent, support, confidence))
    return rules


def main():
    min_support = 0.05    # minimum support threshold
    min_confidence = 0.5  # minimum confidence threshold
    transactions = load_transactions()
    frequent_itemsets = apriori(transactions, min_support)
    rules = generate_rules(frequent_itemsets, min_confidence)

    print("频繁项集和对应的支持度:")
    for itemset, support in frequent_itemsets.items():
        print("{}: Support = {:.2f}".format(itemset, support))
    print("\n关联规则和置信度:")
    for X, Y, support, confidence in rules:
        print("{} => {}: Support = {:.2f}, Confidence = {:.2f}".format(X, Y, support, confidence))


if __name__ == "__main__":
    main()

(3)输出结果截图

(4) FP-Growth(注:以下代码与第(2)节完全相同,实际为 Apriori 实现,并非 FP-Growth 算法)

# NOTE(review): although the section header says "FP-Growth", this script is
# byte-for-byte the same Apriori implementation as section (2); no FP-tree is
# built anywhere below — confirm whether the FP-Growth code was lost.
import pandas as pd # import required libraries (pandas is unused here)
import csv
from itertools import combinations

li = []   # list of transactions; each transaction is a list of genre tags
k = 0     # unused here; reassigned to 2 before the candidate loop below
with open("movies.csv", mode="r", encoding='gb18030', errors='ignore') as file:
    csv_reader = csv.reader(file)
    next(csv_reader) # skip the header row
    for row in csv_reader:

        li.append(row[2].split("|")) # split the third column (pipe-separated genres)


# print(li)
# Minimum support and minimum confidence thresholds.
min_support = 0.05
min_confidence = 0.5
# Count the number of transactions containing each single item.
item_support = {}
for transaction in li:
    for item in transaction:
        if item not in item_support:
            item_support[item] = 0
        item_support[item] += 1
# Total number of transactions.
total_transactions = len(li)
# print(item_support)
# Frequent 1-itemsets: keep items whose relative support meets the threshold.
frequent_itemsets = {}
for item, support in item_support.items():
    if support / total_transactions >= min_support: # item is frequent in the database
        frequent_itemsets[(item,)] = support / total_transactions
# Generate candidate itemsets and iteratively grow frequent itemsets.
k = 2
while True:
    candidates = set() # pool of all items seen in any frequent itemset
    for itemset in frequent_itemsets.keys():
        for item in itemset:
            candidates.add(item)
    # Generate candidate k-itemsets from the pool.
    # NOTE(review): iterating a set gives arbitrary order, so the tuples
    # produced here are not consistently ordered across runs/sizes.
    candidates = list(combinations(candidates, k)) # all possible k-itemsets
    # Count the support of each candidate itemset.
    candidate_support = {}
    for transaction in li:
        for candidate in candidates:
            if set(candidate).issubset(set(transaction)):
                if candidate not in candidate_support:
                    candidate_support[candidate] = 0
                candidate_support[candidate] += 1
    # Keep the candidates that meet the support threshold.
    frequent_itemsets_k = {}
    for candidate, support in candidate_support.items():
        if support / total_transactions >= min_support:
            frequent_itemsets_k[candidate] = support / total_transactions
    # Stop iterating once no frequent k-itemsets remain.
    if not frequent_itemsets_k:
        break
    frequent_itemsets.update(frequent_itemsets_k)
    k += 1
# print(frequent_itemsets)
# Generate association rules X => Y from the frequent itemsets.
rules = []
for itemset in frequent_itemsets.keys():
    if len(itemset) >= 2:
        for i in range(1, len(itemset)):
            for combination in combinations(itemset, i):
                X = combination
                Y = tuple(set(itemset) - set(combination))
                # NOTE(review): this lookup can raise KeyError — X's item
                # order may differ from the order of the stored key tuple
                # (see the arbitrary set-iteration order above).
                confidence = frequent_itemsets[itemset] / frequent_itemsets[X]
                if confidence >= min_confidence:
                    rules.append((X, Y, frequent_itemsets[itemset], confidence))
    # return frequent_itemsets, rules

print("频繁项集和对应的支持度:")
for itemset, support in frequent_itemsets.items():
    print("{}: Support = {:.2f}".format(itemset, support))
# Print the association rules with their confidence.
print("\n关联规则和置信度:")
for X, Y, support, confidence in rules:
    print("{} => {}: Support = {:.2f}, Confidence = {:.2f}".format(X, Y, support, confidence))

  • 4
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
FP-growth算法是一种非常常用的关联分析算法,可以用于挖掘数据集中的频繁项集,进而发现数据集中不同项之间的关联关系。C++作为一种高效的编程语言,也可以用于实现FP-growth算法。 下面是一个基于C++类的FP-growth算法实现及案例示例: ```cpp #include <iostream> #include <fstream> #include <algorithm> #include <vector> #include <map> using namespace std; class Node { public: string name; int count; Node* parent; map<string, Node*> children; Node() { name = ""; count = 0; parent = NULL; } Node(string name, int count) { this->name = name; this->count = count; parent = NULL; } void inc(int num) { count += num; } }; class FPTree { public: Node* root; map<string, int> headerTable; FPTree() { root = new Node(); headerTable.clear(); } void insert(vector<string>& transaction) { Node* cur = root; for (int i = 0; i < transaction.size(); i++) { string item = transaction[i]; if (cur->children.count(item) == 0) { cur->children[item] = new Node(item, 1); cur->children[item]->parent = cur; if (headerTable.count(item) == 0) { headerTable[item] = 1; } else { headerTable[item]++; } } else { cur->children[item]->count++; } cur = cur->children[item]; } } }; class FPGrowth { public: FPTree* tree; map<string, int> items; vector<vector<string>> transactions; FPGrowth() { tree = NULL; } void loadTransactions(string filename) { ifstream fin(filename); if (!fin.is_open()) { return; } string line; while (getline(fin, line)) { vector<string> transaction; string item; for (int i = 0; i < line.size(); i++) { if (line[i] == ' ') { if (items.count(item) == 0) { items[item] = 1; } else { items[item]++; } transaction.push_back(item); item = ""; } else { item += line[i]; } } if (!item.empty()) { if (items.count(item) == 0) { items[item] = 1; } else { items[item]++; } transaction.push_back(item); } transactions.push_back(transaction); } fin.close(); } bool cmp(const pair<string, int>& a, const pair<string, int>& b) { return a.second > b.second; } void buildTree() { tree = new FPTree(); for (int i = 0; i < transactions.size(); i++) { vector<string>& transaction = transactions[i]; sort(transaction.begin(), 
transaction.end(), [&](string a, string b) { return items[a] > items[b]; }); tree->insert(transaction); } } void findPrefixPath(string item, Node* node, vector<Node*>& prefixPath) { while (node != tree->root) { if (node->name == item) { prefixPath.push_back(node); } node = node->parent; } } void mineFrequentItemsets(int minSup) { vector<pair<string, int>> freqItems; for (auto it = items.begin(); it != items.end(); it++) { if (it->second >= minSup) { freqItems.push_back(*it); } } sort(freqItems.begin(), freqItems.end(), cmp); for (int i = 0; i < freqItems.size(); i++) { vector<string> prefix; prefix.push_back(freqItems[i].first); int sup = freqItems[i].second; findPrefixPaths(prefix, tree->headerTable, sup); } } void findPrefixPaths(vector<string>& prefix, map<string, Node*> headerTable, int sup) { string item = prefix[prefix.size() - 1]; Node* node = headerTable[item]->parent; vector<Node*> prefixPath; while (node != tree->root) { prefixPath.clear(); findPrefixPath(item, node, prefixPath); vector<string> subPrefix; for (int i = 0; i < prefix.size() - 1; i++) { subPrefix.push_back(prefix[i]); } subPrefix.push_back(node->name); int count = node->count; for (int i = 0; i < prefixPath.size(); i++) { count = min(count, prefixPath[i]->count); } if (count >= sup) { cout << "{"; for (int i = 0; i < subPrefix.size(); i++) { cout << subPrefix[i] << " "; } cout << item << "} : " << count << endl; findPrefixPaths(subPrefix, node->children, sup); } node = node->parent; } } }; int main() { FPGrowth fpg; fpg.loadTransactions("transactions.txt"); fpg.buildTree(); fpg.mineFrequentItemsets(2); return 0; } ``` 上述代码实现了一个基于类的FP-growth算法,并且支持从文件中加载交易数据,并挖掘出频繁项集。其中,`Node`类表示FP树中的节点,`FPTree`类表示FP树,`FPGrowth`类表示FP-growth算法。具体实现细节可以参考代码注释。 需要注意的是,本示例中的实现仅支持从文件中加载交易数据,并不支持在线实时插入交易数据,如果需要支持在线插入数据,需要对代码进行一定的修改。另外,本示例中的实现也没有进行过多的优化,不适用于大型数据集的挖掘。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值