Mahout frequent pattern mining code template

For frequent pattern mining, Mahout ships an implementation of the FP-Growth algorithm. Below is a working driver template, posted here for future reference.
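For context, the driver expects one transaction per line, with items separated by commas or tabs (the default splitterPattern). A toy input file might look like this (the items below are purely illustrative; the path /qgzz/questions.txt seen in the code comments is just the author's test file):

milk,bread,butter
bread,butter
milk,bread
milk,butter
bread,butter,beer

With minSupport=3, an itemset such as {bread, butter} qualifies as frequent, since it appears in three of the five transactions.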


/**
 * Association rule / frequent pattern mining example.<br>
 * Author: 李国忠
 */
public final class PatternFinder extends AbstractJob {
    private static final Logger log = LoggerFactory.getLogger(PatternFinder.class);
    private PatternFinder() {
    }
    public static void main(String[] args) throws Exception { 
        ToolRunner.run(new Configuration(), new PatternFinder(), args);
    }
    /**
     * Run top-k FPGrowth on the given input file.
     */
    @Override
    public int run(String[] args) throws Exception {
        addInputOption();
        addOutputOption();
        addOption("minSupport", "s", "(Optional) The minimum number of times a co-occurrence must be present." + " Default Value: 3", "3");
        addOption("maxHeapSize", "k", "(Optional) Maximum Heap Size k, to denote the requirement to mine top K items." + " Default value: 50", "50");
        addOption("numGroups", "g", "(Optional) Number of groups the features should be divided in the map-reduce version." + " Doesn't work in sequential version Default Value:" + PFPGrowth.NUM_GROUPS_DEFAULT, Integer.toString(PFPGrowth.NUM_GROUPS_DEFAULT));
        addOption("splitterPattern", "regex", "Regular Expression pattern used to split given string transaction into" + " itemsets. Default value splits comma separated itemsets.  Default Value:" + " \"[ ,\\t]*[,|\\t][ ,\\t]*\" ", "[ ,\t]*[,|\t][ ,\t]*");
        addOption("numTreeCacheEntries", "tc", "(Optional) Number of entries in the tree cache to prevent duplicate" + " tree building. (Warning) a first level conditional FP-Tree might consume a lot of memory, " + "so keep this value small, but big enough to prevent duplicate tree building. " + "Default Value:5 Recommended Values: [5-10]", "5");
        addOption("method", "method", "Method of processing: sequential|mapreduce", "sequential");
        addOption("encoding", "e", "(Optional) The file encoding.  Default value: UTF-8", "UTF-8");
        addFlag("useFPG2", "2", "Use an alternate FPG implementation");
        if (parseArguments(args) == null) {
            return -1;
        }
        Parameters params = new Parameters();
        if (hasOption("minSupport")) {
            String minSupportString = getOption("minSupport");
            params.set("minSupport", minSupportString);
        }
        if (hasOption("maxHeapSize")) {
            String maxHeapSizeString = getOption("maxHeapSize");
            params.set("maxHeapSize", maxHeapSizeString);
        }
        if (hasOption("numGroups")) {
            String numGroupsString = getOption("numGroups");
            params.set("numGroups", numGroupsString);
        }
        if (hasOption("numTreeCacheEntries")) {
            String numTreeCacheString = getOption("numTreeCacheEntries");
            params.set("treeCacheSize", numTreeCacheString);
        }
        if (hasOption("splitterPattern")) {
            String patternString = getOption("splitterPattern");
            params.set("splitPattern", patternString);
        }
        String encoding = "UTF-8";
        if (hasOption("encoding")) {
            encoding = getOption("encoding");
        }
        params.set("encoding", encoding);
        if (hasOption("useFPG2")) {
            params.set(PFPGrowth.USE_FPG2, "true");
        }
        Path inputDir = getInputPath(); // new Path("/qgzz/questions.txt");
        Path outputDir = getOutputPath(); // new Path("/qgzz/q2.txt");
        params.set("input", inputDir.toString());
        params.set("output", outputDir.toString());
        String classificationMethod = getOption("method");
        if ("sequential".equalsIgnoreCase(classificationMethod)) {// 数据量还小的时候,使用内存计算关联规则
            runFPGrowth(params);
        } else if ("mapreduce".equalsIgnoreCase(classificationMethod)) {// when the data set is too large, run parallel computation on Hadoop
            Configuration conf = new Configuration();
            HadoopUtil.delete(conf, outputDir);
            PFPGrowth.runPFPGrowth(params);
        }
        System.out.println("---------------------over----------------------");
        return 0;
    }
    /**
     * "Sequential" mode: the mining itself runs in the driver's memory rather than
     * as a MapReduce job, but the input and output files still live on HDFS.
     */
    private static void runFPGrowth(Parameters params) throws IOException {
        log.info("Starting Sequential FPGrowth");
        // 1. read parameters
        int maxHeapSize = Integer.valueOf(params.get("maxHeapSize", "50"));// top-k: max patterns kept per feature
        int minSupport = Integer.valueOf(params.get("minSupport", "3"));// minimum support count (not confidence)
        Path output = new Path(params.get("output", "output.txt"));
        Path input = new Path(params.get("input"));
        // 2. prepare algorithm inputs
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(output.toUri(), conf);
        // FileSystem fs = FileSystem.get(URI.create("hdfs://quickstart.cloudera:8020"), conf); // hardcoding the namenode URI ties the code to one cluster
        Charset encoding = Charset.forName(params.get("encoding"));
        String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, TopKStringPatterns.class);
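        // The output is a SequenceFile mapping each feature (Text) to its top-k patterns (TopKStringPatterns).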
        Collection<String> features = Sets.newHashSet();
        // 3. mine the frequent patterns
        if ("true".equals(params.get(PFPGrowth.USE_FPG2))) {
            doFP2(fs, input, maxHeapSize, minSupport, features, encoding, pattern, writer);
        } else {
            doFP(fs, input, maxHeapSize, minSupport, features, encoding, pattern, writer);
        }
        // 4. print the results
        print(conf, new Path(params.get("output", "output.txt")));
    }
    static void print(Configuration conf, Path path) {
        List<Pair<String, TopKStringPatterns>> frequentPatterns = FPGrowth.readFrequentPattern(conf, path);
        System.out.println("-------------------------------------");
        for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
            System.out.println(entry.getFirst() + "...." + entry.getSecond());
            log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond());
            // doDAO(entry.getSecond()); // persist results to a DB for display on the web side
        }
        System.out.println("------------------over----------------");
        System.out.println("size:" + frequentPatterns.size());
    }
    private static void doFP2(FileSystem fs, Path input, int maxHeapSize, int minSupport, Collection<String> features, Charset encoding, String pattern, SequenceFile.Writer writer) throws IOException {
        org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthObj<String> fp = new org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthObj<String>();
        FSDataInputStream inputStream = null;
        FSDataInputStream inputStreamAgain = null;
        try {
            inputStream = fs.open(input);      // first pass: the transactions to mine
            inputStreamAgain = fs.open(input); // second pass: builds the frequency list
            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern),
                    fp.generateFList(new StringRecordIterator(new FileLineIterable(inputStreamAgain, encoding, false), pattern), minSupport),
                    minSupport,
                    maxHeapSize,
                    features,
                    new StringOutputConverter(new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)));
        } finally {
            Closeables.close(writer, false);
            Closeables.close(inputStream, true);
            Closeables.close(inputStreamAgain, true);
        }
    }
    private static void doFP(FileSystem fs, Path input, int maxHeapSize, int minSupport, Collection<String> features, Charset encoding, String pattern, SequenceFile.Writer writer) throws IOException {
        FPGrowth<String> fp = new FPGrowth<String>();
        FSDataInputStream inputStream = null;
        FSDataInputStream inputStreamAgain = null;
        try {
            // open the streams inside try so a failed second open cannot leak the first
            inputStream = fs.open(input);
            inputStreamAgain = fs.open(input);
            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern),
                    fp.generateFList(new StringRecordIterator(new FileLineIterable(inputStreamAgain, encoding, false), pattern), minSupport),
                    minSupport,
                    maxHeapSize,
                    features,
                    new StringOutputConverter(new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                    new ContextStatusUpdater(null));
        } finally {
            Closeables.close(writer, false);
            Closeables.close(inputStream, true);
            Closeables.close(inputStreamAgain, true);
        }
    }
}
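A minimal usage sketch (the HDFS paths are placeholders for illustration, not from the original post). Since main() already wires up ToolRunner, the driver can be launched either via hadoop jar or programmatically:

public class PatternFinderDemo {
    public static void main(String[] args) throws Exception {
        // Equivalent to: hadoop jar <your-jar> PatternFinder -i ... -o ... -s 3 -k 50 --method sequential
        PatternFinder.main(new String[] {
                "-i", "/qgzz/questions.txt", // input transactions, one per line (placeholder path)
                "-o", "/qgzz/patterns",      // output SequenceFile of top-k patterns (placeholder path)
                "-s", "3",                   // minSupport
                "-k", "50",                  // maxHeapSize (top-k)
                "--method", "sequential"     // or "mapreduce" for large data sets
        });
    }
}

The -i/-o flags come from AbstractJob's addInputOption()/addOutputOption(); the remaining flags match the addOption(...) calls in run() above.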
