关于数据挖掘Apriori算法的实现（个人作业）

唥雨凊

于 2020-10-19 15:34:23 发布

阅读量506

点赞数 1

分类专栏：笔记文章标签： java 数据挖掘

本文链接：https://blog.csdn.net/weixin_44777669/article/details/109161852

版权

笔记专栏收录该内容

29 篇文章 1 订阅

订阅专栏

关于数据挖掘Apriori算法的实现（个人作业）

关于Apriori算法的理解可以查看下面的文章

https://blog.csdn.net/huihuisd/article/details/86489810

https://blog.csdn.net/sky88088/article/details/51756415

关于最大频繁项集，我个人的理解是：如果一个频繁项集的所有真超集都不是频繁项集，那么它就是最大频繁项集（这一点在后面算法的实现中很重要）

/**
 * Apriori frequent-itemset mining demo over a small hard-coded transaction set.
 *
 * <p>Thresholds: minimum confidence = 60%, minimum support = 40%.
 * For the built-in data the maximal frequent itemsets are {A,B,C,D} and {B,C,E}.
 * Mining runs once from the static initializer; {@link #main} only prints the
 * results and the association rules derived from the maximal frequent itemsets.
 *
 * <p>Background reading:
 *  https://blog.csdn.net/huihuisd/article/details/86489810
 *  https://blog.csdn.net/sky88088/article/details/51756415
 *
 * @author Programmer Li
 */
public class aprioriAlgorithm {
    // Size k of the itemsets handled by the current pass (used in log output).
    public static int times = 0;
    // Running number printed in front of each association rule.
    public static int nums = 1;
    // Minimum support, as a fraction of the number of transactions.
    private static final double MIN_SUPPROT = 0.4;
    // Minimum confidence for a rule to be reported as an association.
    private static final double MIN_CONFIDENCE = 0.6;
    // Set to true by getSupprotedItemset when a pass yields no frequent itemset.
    private static boolean endTag = false;
    // The transactions (data set).
    static List<List<String>> record = new ArrayList<List<String>>();
    // Every frequent itemset found, in the order the passes discovered them.
    static List<List<String>> frequentItemset = new ArrayList<List<String>>();
    // Every maximal frequent itemset.
    static List<List<String>> MaximumFrequentItemSet = new ArrayList<List<String>>();
    // Frequent itemsets paired with their support counts.
    static List<Mymap> map = new ArrayList<Mymap>();
    // Raw transaction data; copied into record by the static initializer.
    static String[][] str = {{"A", "B", "C", "D"},
            {"B", "C", "E",},
            {"A", "B", "C", "E"},
            {"B", "D", "E",},
            {"A", "B", "C", "D"}};

    /*
     * Data set:
     * A,B,C,D
     * B,C,E
     * A,B,C,E
     * B,D,E
     * A,B,C,D
     */

    static {
        for (String[] row : str) {
            List<String> data = new ArrayList<String>();
            for (String s : row) {
                data.add(s);
            }
            record.add(data);
        }
        Apriori();
        MaximumFrequentItemSet = foundMaximumFrequentItemSet(frequentItemset);
    }

    /**
     * Runs the Apriori passes over {@code record}.
     *
     * <p>Pass 1 collects every distinct item as a candidate 1-itemset and keeps
     * the frequent ones. Each later pass joins the previous frequent itemsets
     * into candidates one item larger and keeps those that meet the minimum
     * support, until a pass produces no frequent itemset. Results are stored in
     * {@code frequentItemset} and {@code map}; both are cleared first so that
     * calling this method again does not accumulate duplicates.
     */
    public static void Apriori() {
        frequentItemset.clear();
        map.clear();
        endTag = false;
        // The first pass works on 1-itemsets; set this before any log line uses it.
        times = 1;

        System.out.println("第一次扫描后的1级 备选集CandidateItemset");
        List<List<String>> CandidateItemset = findFirstCandidate();

        System.out.println("第一次扫描后的1级 频繁集FrequentItemset");
        List<List<String>> FrequentItemset = getSupprotedItemset(CandidateItemset);
        AddToFrequenceItem(FrequentItemset);

        times = 2;
        while (!endTag) {
            System.out.println("*******************************第" + times + "次扫描后备选集");
            // Join step: build the candidate k-itemsets.
            List<List<String>> nextCandidateItemset = getNextCandidate(FrequentItemset);

            System.out.println("*******************************第" + times + "次扫描后频繁集");
            // Count step: keep the candidates that meet the minimum support.
            List<List<String>> nextFrequentItemset = getSupprotedItemset(nextCandidateItemset);
            AddToFrequenceItem(nextFrequentItemset);

            FrequentItemset = nextFrequentItemset;
            times++;
        }
    }

    /**
     * Builds the candidate 1-itemsets: one single-element list per distinct
     * item occurring in {@code record}.
     *
     * @return the candidate 1-itemsets, in the iteration order of a HashSet
     */
    private static List<List<String>> findFirstCandidate() {
        HashSet<String> distinctItems = new HashSet<String>();
        for (List<String> transaction : record) {
            distinctItems.addAll(transaction);
        }

        List<List<String>> tableList = new ArrayList<List<String>>();
        for (String item : distinctItems) {
            List<String> single = new ArrayList<String>();
            single.add(item);
            tableList.add(single);
        }
        return tableList;
    }

    /**
     * Joins the current frequent k-itemsets with each other to produce the
     * candidate (k+1)-itemsets.
     *
     * <p>Two itemsets are joined when their union is exactly one item larger
     * than the first one; each distinct union is emitted once.
     *
     * @param FrequentItemset the frequent itemsets of the previous pass
     * @return the candidate itemsets for the next pass (possibly empty)
     */
    private static List<List<String>> getNextCandidate(List<List<String>> FrequentItemset) {
        List<List<String>> nextCandidateItemset = new ArrayList<List<String>>();

        for (int i = 0; i < FrequentItemset.size(); i++) {
            HashSet<String> base = new HashSet<String>(FrequentItemset.get(i));
            for (int h = i + 1; h < FrequentItemset.size(); h++) {
                // Work on a copy so the left-hand itemset stays unchanged for the next h.
                HashSet<String> joined = new HashSet<String>(base);
                joined.addAll(FrequentItemset.get(h));
                if (joined.size() == base.size() + 1 && isnotHave(joined, nextCandidateItemset)) {
                    nextCandidateItemset.add(new ArrayList<String>(joined));
                }
            }
        }
        return nextCandidateItemset;
    }

    /**
     * Selects the maximal frequent itemsets: those for which no strictly
     * larger itemset in the given list contains all of their items.
     *
     * @param FrequentItemset all frequent itemsets (any order)
     * @return the maximal frequent itemsets, visited from the end of the input
     *         list towards its start
     */
    private static List<List<String>> foundMaximumFrequentItemSet(List<List<String>> FrequentItemset) {
        List<List<String>> maximal = new ArrayList<>();
        for (int i = FrequentItemset.size() - 1; i >= 0; i--) {
            List<String> candidate = FrequentItemset.get(i);
            boolean hasFrequentSuperset = false;
            // Scan the whole list so the result does not depend on it being sorted by size.
            for (List<String> other : FrequentItemset) {
                if (other.size() > candidate.size() && other.containsAll(candidate)) {
                    hasFrequentSuperset = true;
                    break;
                }
            }
            if (!hasFrequentSuperset) {
                maximal.add(candidate);
            }
        }
        return maximal;
    }

    /**
     * Appends the given itemsets to {@code frequentItemset}.
     *
     * @param fre itemsets to append
     * @return always {@code true}
     */
    public static boolean AddToFrequenceItem(List<List<String>> fre) {
        frequentItemset.addAll(fre);
        return true;
    }

    /**
     * Filters candidate itemsets down to those meeting the minimum support.
     *
     * <p>Prints every candidate, records each frequent itemset with its support
     * count in {@code map}, and sets {@code endTag} to {@code true} when no
     * candidate qualifies.
     *
     * @param CandidateItemset candidate itemsets of the current pass
     * @return the candidates whose support count is at least
     *         {@code MIN_SUPPROT * record.size()}
     */
    private static List<List<String>> getSupprotedItemset(List<List<String>> CandidateItemset) {
        List<List<String>> supportedItemset = new ArrayList<List<String>>();

        for (List<String> candidate : CandidateItemset) {
            System.out.println(candidate);
            int count = countFrequent1(candidate);
            if (count >= MIN_SUPPROT * record.size()) {
                supportedItemset.add(candidate);
                map.add(new Mymap(candidate, count));
            }
        }

        // The iteration stops once a pass finds no frequent itemset.
        endTag = supportedItemset.isEmpty();
        if (endTag) {
            System.out.println("*****************无满足支持度的" + times + "项集,结束连接");
        }
        return supportedItemset;
    }

    /**
     * Counts the transactions in {@code record} that contain every item of
     * the given itemset.
     *
     * @param list the itemset to count
     * @return the support count
     */
    private static int countFrequent1(List<String> list) {
        int count = 0;
        for (List<String> transaction : record) {
            if (transaction.containsAll(list)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Tells whether an itemset is still missing from the candidate list.
     *
     * <p>The comparison ignores element order, so the same itemset listed in a
     * different order counts as already present.
     *
     * @param hsSet                the itemset to look for
     * @param nextCandidateItemset the candidates collected so far
     * @return {@code false} if an equal itemset is already present, otherwise {@code true}
     */
    private static boolean isnotHave(HashSet<String> hsSet, List<List<String>> nextCandidateItemset) {
        for (List<String> existing : nextCandidateItemset) {
            // Candidates are built from sets, so equal size plus containment means equal sets.
            if (existing.size() == hsSet.size() && hsSet.containsAll(existing)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Prints the association rules derived from every maximal frequent itemset
     * with more than one item: each non-empty proper subset is used as the
     * antecedent and the remaining items as the consequent.
     */
    public static void AssociationRulesMining() {
        for (List<String> tem : MaximumFrequentItemSet) {
            if (tem.size() <= 1) {
                continue;
            }
            List<List<String>> AllSubset = getSubSet(new ArrayList<String>(tem));
            for (List<String> s1 : AllSubset) {
                List<String> s2 = gets2set(tem, s1);
                isAssociationRules(s1, s2, tem);
            }
        }
    }

    /**
     * Computes and prints the confidence of the rule {@code s1 => s2}, i.e.
     * support(tem) / support(s1), using the support counts stored in {@code map}.
     *
     * @param s1  antecedent itemset
     * @param s2  consequent itemset (only printed)
     * @param tem the full itemset the rule is derived from
     * @return the confidence, or 0 when {@code s1} or {@code tem} is null or
     *         empty, or when no support count is recorded for {@code s1}
     */
    public static double isAssociationRules(List<String> s1, List<String> s2, List<String> tem) {
        // Null checks must come before any dereference.
        if (s1 == null || s1.isEmpty() || tem == null || tem.isEmpty()) {
            return 0;
        }
        int counts1 = getCount(s1);
        if (counts1 == 0) {
            // Unknown antecedent: avoid dividing by zero and printing Infinity/NaN.
            return 0;
        }
        int countTem = getCount(tem);
        double confidence = countTem * 1.0 / counts1;
        // A rule that exactly reaches the minimum confidence qualifies, matching
        // the inclusive comparison used for the minimum support.
        System.out.println(nums + "   " + "关联规则：" + s1 + "=>>" + s2 + "   " + "置信度为：" + (double) Math.round(confidence * 100) / 100 + "       是否关联：" + (confidence >= MIN_CONFIDENCE));
        nums++;
        return confidence;
    }

    /**
     * Looks up the support count recorded for a frequent itemset.
     *
     * @param in the itemset to look up
     * @return its support count, or 0 if no matching entry exists in {@code map}
     */
    public static int getCount(List<String> in) {
        for (Mymap entry : map) {
            if (entry.isListEqual(in)) {
                return entry.getcount();
            }
        }
        return 0;
    }

    /**
     * Returns the items of {@code tem} that are not in {@code s1}.
     *
     * @param tem the full itemset
     * @param s1  the items to remove
     * @return a new list holding {@code tem} minus {@code s1}, in {@code tem}'s order
     */
    public static List<String> gets2set(List<String> tem, List<String> s1) {
        List<String> result = new ArrayList<String>();
        for (String item : tem) {
            if (!s1.contains(item)) {
                result.add(item);
            }
        }
        return result;
    }

    /**
     * Enumerates every non-empty proper subset of the given list.
     *
     * <p>Subsets are generated from the bit patterns 1 .. 2^n - 2, where bit j
     * selects element j; the empty set and the full set are excluded.
     *
     * @param set the items to build subsets from
     * @return the subsets, in increasing bit-pattern order (empty for n of 0 or 1)
     * @throws IllegalArgumentException if {@code set} has more than 30 elements,
     *         because the int bit mask would overflow
     */
    public static List<List<String>> getSubSet(List<String> set) {
        int length = set.size();
        if (length > 30) {
            throw new IllegalArgumentException("too many items to enumerate subsets: " + length);
        }
        List<List<String>> result = new ArrayList<List<String>>();
        int num = 1 << length;

        for (int mask = 1; mask < num - 1; mask++) {
            List<String> subSet = new ArrayList<String>();
            for (int j = 0; j < length; j++) {
                if (((mask >> j) & 1) == 1) {
                    subSet.add(set.get(j));
                }
            }
            result.add(subSet);
        }
        return result;
    }

    /**
     * Prints the mining results computed by the static initializer, followed
     * by the association rules.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println();
        System.out.println();
        System.out.println("======================================运行结果如下===============================================");
        System.out.println("频繁项目集===>" + frequentItemset);
        System.out.println("最大频繁项目集===>" + MaximumFrequentItemSet);
        AssociationRulesMining();
    }

}

/**
 * Pairs one frequent itemset with its support count.
 */
public class Mymap {
    // The itemset; the list passed to the constructor is stored as-is, not copied.
    public List<String> li;
    // Support count of the itemset.
    public int count;

    /**
     * Creates an entry for an itemset and its support count.
     *
     * @param l the itemset
     * @param c its support count
     */
    public Mymap(List<String> l, int c) {
        li = l;
        count = c;
    }

    /**
     * Returns the support count of this itemset.
     *
     * @return the support count
     */
    public int getcount() {
        return count;
    }

    /**
     * Tells whether the given itemset holds the same items as this one,
     * ignoring order.
     *
     * <p>Every element must be matched one-to-one, so lists of equal size with
     * different contents (for example {@code [A, A]} against {@code [A, B]})
     * are not considered equal.
     *
     * @param in the itemset to compare with
     * @return {@code true} if both lists contain the same elements with the
     *         same multiplicities; {@code false} otherwise, including when
     *         either list is {@code null}
     */
    public boolean isListEqual(List<String> in) {
        if (in == null || li == null) {
            return false;
        }
        if (in.size() != li.size()) {
            return false;
        }
        // Remove each incoming item from a scratch copy; a failed removal means
        // the item is absent or occurs more often in 'in' than in 'li'.
        List<String> unmatched = new java.util.ArrayList<String>(li);
        for (String item : in) {
            if (!unmatched.remove(item)) {
                return false;
            }
        }
        return true;
    }
}

唥雨凊

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
关于数据挖掘Apriori算法的实现（个人作业）

关于数据挖掘Apriori算法的实现（个人作业）关于Apriori算法的理解可以查看下面的文章https://blog.csdn.net/huihuisd/article/details/86489810https://blog.csdn.net/sky88088/article/details/51756415关于最大频繁项集，我个人的理解就是，一个频繁项集的所有超集都不是频繁项集（这在后面算法中的实现很重要）/* * minconfidence=60% * minsupport=40%
复制链接

扫一扫

专栏目录