# Three approaches (三种方式) to association-rule mining on the same basket data:
# 1) efficient_apriori, 2) mlxtend apriori, 3) PySpark FPGrowth.
import pandas as pd
import numpy as np
from efficient_apriori import apriori

# Approach 1: efficient_apriori on a list of per-receipt item sets.
# Load the basket data: no header row, each CSV row is one receipt and
# each cell is one purchased item (missing cells become 0 after fillna).
data = pd.read_csv('Market_Basket_Optimisation.csv', header=None)
data = data.fillna(0)
# print(data)

# Build one set of items per receipt, skipping the 0 fill values.
# itertuples iterates whole rows at C speed, avoiding the very slow
# per-cell data.iloc[i, j] double loop.
transaction = []
for row in data.itertuples(index=False, name=None):
    transaction.append({item for item in row if item != 0})
# print(transaction)

print('总计有{}张小票'.format(data.shape[0]), '\n')
# Mine frequent itemsets and association rules.
itemsets, rules = apriori(transaction, min_support=0.03, min_confidence=0.3)
print('频繁项集为:\n', itemsets, '\n')
print('关联规则为:\n', rules)
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from pandas import DataFrame

# Approach 2: mlxtend apriori on a one-hot encoded basket matrix.
data = pd.read_csv('Market_Basket_Optimisation.csv', header=None)
data = data.fillna(0)

# Join each receipt's non-empty cells into one comma-separated string
# (no trailing separator), one string per receipt.
transaction = []
for row in data.itertuples(index=False, name=None):
    transaction.append(','.join(str(item) for item in row if item != 0))
transactions = DataFrame({'Item': transaction})

# One-hot encode the item strings. The original
#   transactions.drop('Item', 1).join(...)
# passed `axis` positionally, which raises TypeError on pandas >= 2.0,
# and dropping the only column before the join is equivalent to using
# the dummies frame directly.
one_hot = transactions.Item.str.get_dummies(',')

# Mine frequent itemsets and association rules on the one-hot matrix.
frequent = apriori(one_hot, min_support=0.05, use_colnames=True)
rules = association_rules(frequent, metric='lift', min_threshold=1)
print('总计有{}张小票'.format(one_hot.shape[0]), '\n')
print('频繁项集为:\n', frequent, '\n')
print('关联规则为:\n')
# A bare `rules` expression only displays in a notebook; print works everywhere.
print(rules)
import pandas as pd
import numpy as np
from efficient_apriori import apriori

# Approach 3 (part 1): rebuild the transactions in the nested-list shape
# Spark's createDataFrame expects: one [items_list] row per receipt.
# NOTE(review): hard-coded absolute Windows path — kept for behavior, but
# consider the relative 'Market_Basket_Optimisation.csv' used by the other
# two approaches so the script is portable.
data = pd.read_csv('c:/Users/10109/Documents/Jupyter notebook/人工智能课程(BI方向)/商业智能和推荐系统/lesson2 挖掘数据中的关联规则/homework/Market_Basket_Optimisation.csv', header=None)
data = data.fillna(0)
# print(data)

# One set of items per receipt, skipping the 0 fill values.
transaction = []
for row in data.itertuples(index=False, name=None):
    transaction.append({item for item in row if item != 0})

# Wrap each item set as [[item, ...]] — the single-column row format
# consumed by spark.createDataFrame(data, ["items"]) below.
transactions = [[list(items)] for items in transaction]
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth
import datetime

if __name__ == "__main__":
    t1 = datetime.datetime.now()
    appname = "FPgrowth"
    master = "local[4]"
    # Spark configuration: local mode with 4 worker threads.
    conf = SparkConf().setAppName(appname).setMaster(master)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    # Load the prepared [[items]] rows (built above from the CSV).
    data = transactions
    # Convert to a Spark DataFrame with a single "items" column
    # (FPGrowth's default itemsCol).
    data = spark.createDataFrame(data, ["items"])
    # Build the FP-Growth model.
    fp = FPGrowth(minSupport=0.03, minConfidence=0.2)
    # Fit the model.
    fpm = fp.fit(data)
    # Show the first 10 frequent itemsets on the console.
    fpm.freqItemsets.show(10)
    # Strong association rules.
    assRule = fpm.associationRules
    assRule.show(5)
    # Convert to a pandas DataFrame.
    assRuleDf = assRule.toPandas()
    # Sort by lift in descending order.
    assRuleDf = assRuleDf.sort_values(by="lift", ascending=False)
    print('强关联规则:\n', assRuleDf)
    # New antecedent data.
    new_data = spark.createDataFrame([(["milk"], )], ["items"])
    # Predict the consequent items.
    print('后项预测:\n', fpm.transform(new_data).first().prediction)
    spark.stop()  # shut down Spark
    t2 = datetime.datetime.now()
    print('spent ts:', t2 - t1)
    # If you hit "'NoneType' object has no attribute 'setCallSite'",
    # restart the kernel.
    # Expected input format for `data`:
    # data_list = [[['r', 'z', 'h', 'k', 'p']],
    #              [['z', 'y', 'x', 'w', 'v', 'u', 't', 's']],
    #              [['s', 'x', 'o', 'n', 'r']],
    #              [['x', 'z', 'y', 'm', 't', 's', 'q', 'e']],
    #              [['z']],
    #              [['x', 'z', 'y', 'r', 'q', 't', 'p']]]  # dataset