Hadoop/MapReduce购物篮分析:关联规则挖掘

购物篮分析

目的:查找一个给定超市或者网店购物篮中最常出现的商品对(阶数为1,2...)
例如:如果有5个商品{A,B,C,D,E},对应以下6个交易:
Transaction 1:A,C
Transaction 2:B,D
Transaction 3:A,C,E
Transaction 4:C,E
Transaction 5:A,B,E
Transaction 6:B,E
我们的目标是构建项集F1(大小=1)和F2(大小=2):
F1={[C,3],[A,3],[B,3],[E,4]}
F2={[<A,C>,2],[<C,E>,2],[<A,E>,2],[<B,E>,2]}
那么问题来了:为什么没有D呢?
在这个例子中,我们使用的最小支持度为2。支持度是一个模式在整个交易集中出现的次数,因此要去除[D,1]
项集F1和F2可以用来生成交易的关联规则。关联规则形式:
LHS(左件) => RHS(右件)
可乐 => 薯片
如果顾客购买可乐,他们也会购买薯片。
关联规则的两个度量标准:
支持度,是一个模式在整个交易集中出现的次数
置信度,在包含左件的交易中同时也包含右件的比例,即 支持度(左件和右件同时出现) / 支持度(左件)


输入:
crackers,bread,banana
crackers,coke,butter,coffee
crackers,bread
crackers,bread,coffee
butter,coke
butter,coke,bread,crackers

思路:
每个map接受一个交易,这是一个顾客购买的一个商品集{I1,I2..In}。映射器首先对这些商品排序(升序或者降序),生成{S1,S2...Sn}
然后发出(key,1)对,这里key=Tuple2(Si,Sj),Si<=Sj,而且value为1,表示这个键已经见过一次。
组合器和规约器的任务是聚集和统计频度。

为什么要对商品进行排序?
避免类似(crackers,bread)和(bread,crackers)的重复键

新建一个工具类Combination
定义一个findSortedCombinations方法,可以为任意阶数创建一个唯一组合
eg:
List<String> elements = Arrays.asList("a", "b", "c", "d", "e");
List<List<String>> combinations = findSortedCombinations(elements, 2);

System.out.println(combinations);

结果为:

[[a, b], [a, c], [a, d], [a, e], [b, c], [b, d], [b, e], [c, d], [c, e], [d, e]]

package MBA;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;


/**
 * Utility for enumerating the unique, sorted combinations of a collection of
 * comparable elements.
 *
 * <p>Every returned combination is sorted in natural order, so two selections
 * of the same elements in a different order (for example {@code (a, c, b)} and
 * {@code (a, b, c)}) collapse into a single combination.
 */
public class Combination {

    /**
     * Returns the unique sorted combinations of every size from {@code 0}
     * (the empty combination) up to {@code elements.size()}.
     *
     * @param elements the elements to combine
     * @param <T>      the element type
     * @return all combinations, grouped by increasing size
     */
    public static <T extends Comparable<? super T>> List<List<T>> findSortedCombinations(Collection<T> elements) {
        List<List<T>> result = new ArrayList<List<T>>();
        List<List<T>> level = emptyLevel();
        result.addAll(level);
        // Grow each level from the previous one instead of recomputing it from scratch.
        for (int size = 1; size <= elements.size(); size++) {
            level = extendByOneElement(elements, level);
            result.addAll(level);
        }
        return result;
    }

    /**
     * Returns the unique sorted combinations containing exactly {@code n}
     * elements.
     *
     * <p>For {@code n == 0} the result holds a single empty combination. When
     * {@code n} exceeds the number of distinct elements the result is empty.
     *
     * @param elements the elements to combine
     * @param n        the size of each combination; must not be negative
     * @param <T>      the element type
     * @return the combinations of size {@code n}
     * @throws IllegalArgumentException if {@code n} is negative
     */
    public static <T extends Comparable<? super T>> List<List<T>> findSortedCombinations(Collection<T> elements, int n) {
        if (n < 0) {
            // Previously a negative size recursed without end and failed with StackOverflowError.
            throw new IllegalArgumentException("n must not be negative: " + n);
        }

        List<List<T>> level = emptyLevel();
        // Once a level is empty every larger level is empty too, so stop early.
        for (int size = 1; size <= n && !level.isEmpty(); size++) {
            level = extendByOneElement(elements, level);
        }
        return level;
    }

    /** Returns the size-0 level: a list holding one empty combination. */
    private static <T> List<List<T>> emptyLevel() {
        List<List<T>> level = new ArrayList<List<T>>();
        level.add(new ArrayList<T>());
        return level;
    }

    /**
     * Builds the combinations of size k+1 from the combinations of size k by
     * adding each element that a combination does not already contain.
     */
    private static <T extends Comparable<? super T>> List<List<T>> extendByOneElement(
            Collection<T> elements, List<List<T>> smaller) {
        // The linked set drops duplicates in constant time and keeps first-seen order.
        Set<List<T>> unique = new LinkedHashSet<List<T>>();
        for (List<T> combination : smaller) {
            for (T element : elements) {
                if (combination.contains(element)) {
                    continue;
                }
                List<T> candidate = new ArrayList<T>(combination.size() + 1);
                candidate.addAll(combination);
                candidate.add(element);
                // Sort so that (a, b, c) and (a, c, b) are recognised as the same combination.
                Collections.sort(candidate);
                unique.add(candidate);
            }
        }
        return new ArrayList<List<T>>(unique);
    }

    /**
     * Basic test of findSortedCombinations(): prints all pairs of five letters.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> elements = Arrays.asList("a", "b", "c", "d", "e");
        List<List<String>> combinations = findSortedCombinations(elements, 2);
        System.out.println(combinations);
    }

}


package MBA;

import java.io.IOException;
import java.util.List;
import java.util.ArrayList;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import org.apache.log4j.Logger;
import org.apache.commons.lang3.StringUtils;


/**
 * Mapper for market basket analysis.
 *
 * <p>Each input line is one transaction: a comma-separated list of item names.
 * For every unique sorted combination of {@code numberOfPairs} items in the
 * transaction the mapper emits {@code (combination, 1)}.
 */
public class MBAMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

   public static final Logger THE_LOGGER = Logger.getLogger(MBAMapper.class);

   /** Combination size used when "number.of.pairs" is not set in the job configuration. */
   public static final int DEFAULT_NUMBER_OF_PAIRS = 2;

   private static final IntWritable NUMBER_ONE = new IntWritable(1);

   // Reusable output key. Kept per instance rather than static so that mapper
   // instances sharing one JVM do not overwrite each other's key.
   private final Text reducerKey = new Text();

   /** Combination size read from the job configuration in {@link #setup}. */
   int numberOfPairs;

   /**
    * Reads the combination size from the "number.of.pairs" configuration entry.
    */
   @Override
   protected void setup(Context context) throws IOException, InterruptedException {
      this.numberOfPairs = context.getConfiguration().getInt("number.of.pairs", DEFAULT_NUMBER_OF_PAIRS);
      THE_LOGGER.info("setup() numberOfPairs = " + numberOfPairs);
    }

   /**
    * Splits one transaction line into items and emits its combinations.
    * Lines without any item are skipped.
    *
    * @param key     input key supplied by the input format (unused)
    * @param value   one transaction as comma-separated item names
    * @param context used to emit the (combination, 1) pairs
    */
   @Override
   public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
      List<String> items = convertItemsToList(value.toString());
      if (items.isEmpty()) {
         return;
      }
      generateMapperOutput(numberOfPairs, items, context);
   }

   /**
    * Converts a comma-separated line into a list of trimmed item names.
    *
    * @param line the raw transaction line; may be null or empty
    * @return the item names, never null; empty when the line holds no items
    */
   private static List<String> convertItemsToList(String line) {
      List<String> items = new ArrayList<String>();
      if ((line == null) || (line.length() == 0)) {
         return items;
      }
      String[] tokens = StringUtils.split(line, ",");
      if (tokens == null) {
         return items;
      }
      for (String token : tokens) {
         if (token == null) {
            continue;
         }
         String item = token.trim();
         // A whitespace-only token (as in "a, ,b") is not an item; keeping it
         // would produce keys that contain an empty item name.
         if (!item.isEmpty()) {
            items.add(item);
         }
      }
      return items;
   }

   /**
    * Emits (combination, 1) for every unique sorted combination of the given size.
    *
    * @param numberOfPairs the number of items in each emitted combination
    * @param items         the items of one transaction
    * @param context       used to emit the output pairs
    * @throws IOException          if writing to the context fails
    * @throws InterruptedException if writing to the context is interrupted
    */
   private void generateMapperOutput(int numberOfPairs, List<String> items, Context context)
      throws IOException, InterruptedException {
      List<List<String>> sortedCombinations = Combination.findSortedCombinations(items, numberOfPairs);
      boolean debug = THE_LOGGER.isDebugEnabled();
      for (List<String> itemList: sortedCombinations) {
         String combinationKey = itemList.toString();
         if (debug) {
            // Per-key tracing goes to the logger at debug level, not to stdout.
            THE_LOGGER.debug("itemlist=" + combinationKey);
         }
         reducerKey.set(combinationKey);
         context.write(reducerKey, NUMBER_ONE);
      }
   }

}



package MBA;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer (also used as the combiner) that adds up the counts received for
 * each key and emits the key with its total.
 *
 * @author chenjie
 */
public class MBAReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

   /**
    * Sums every value of the given key and writes (key, total).
    *
    * @param key     the key whose counts are being aggregated
    * @param values  the partial counts for that key
    * @param context used to emit the aggregated pair
    */
   @Override
   public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
      int total = 0;
      for (IntWritable count : values) {
         total = total + count.get();
      }
      IntWritable aggregated = new IntWritable(total);
      context.write(key, aggregated);
   }
}


package MBA;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.log4j.Logger;



/**
 * Driver that configures and submits the market basket analysis job.
 *
 * <p>Expects three arguments: input path, output path and the number of items
 * per combination. When started without any argument the built-in defaults
 * are used.
 */
public class MBADriver extends Configured implements Tool {
    private static final String INPATH = "input/mba.txt";// default input path
    private static final String OUTPATH = "output/mba";// default output path
    private static final String DEFAULT_NUMBER_OF_PAIRS = "2";// default combination size

   public static final Logger THE_LOGGER = Logger.getLogger(MBADriver.class);

   /**
    * Command-line entry point.
    *
    * @param args either empty (use the defaults) or
    *             [input-path] [output-path] [number-of-pairs], optionally
    *             preceded by generic Hadoop options
    */
   public static void main(String args[]) throws Exception {
      // Fall back to the defaults only when nothing was supplied, so that
      // real command-line arguments are no longer overwritten.
      if (args.length == 0) {
         args = new String[] { INPATH, OUTPATH, DEFAULT_NUMBER_OF_PAIRS };
      }

      // Argument validation happens in run(), after ToolRunner has removed
      // the generic options from the argument list.
      int exitStatus = ToolRunner.run(new MBADriver(), args);
      THE_LOGGER.info("exitStatus="+exitStatus);
      System.exit(exitStatus);
   }

   /** Prints the expected arguments and the generic Hadoop options; returns -1. */
   private static int printUsage(){
      System.out.println("USAGE: [input-path] [output-path] [number-of-pairs]");
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
   }

   /**
    * Configures the job, removes an existing output directory, runs the job
    * and logs the elapsed time.
    *
    * @param args [input-path] [output-path] [number-of-pairs]
    * @return 0 when the job succeeds, 1 when it fails or the arguments are invalid
    */
   @Override
   public int run(String args[]) throws Exception {
      if ((args == null) || (args.length != 3)) {
         printUsage();
         return 1;
      }
      String inputPath = args[0];
      String outputPath = args[1];
      int numberOfPairs;
      try {
         numberOfPairs = Integer.parseInt(args[2]);
      } catch (NumberFormatException e) {
         THE_LOGGER.error("number-of-pairs is not an integer: " + args[2], e);
         printUsage();
         return 1;
      }

      THE_LOGGER.info("inputPath: " + inputPath);
      THE_LOGGER.info("outputPath: " + outputPath);
      THE_LOGGER.info("numberOfPairs: " + numberOfPairs);

      // job configuration
      Job job = Job.getInstance(getConf());
      job.setJobName("MBADriver");
      job.getConfiguration().setInt("number.of.pairs", numberOfPairs);
      // Identify the jar that contains the job classes by this class.
      job.setJarByClass(MBADriver.class);

      //input/output path
      Path outputDir = new Path(outputPath);
      FileInputFormat.setInputPaths(job, new Path(inputPath));
      FileOutputFormat.setOutputPath(job, outputDir);

      //Mapper K, V output
      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(IntWritable.class);
      //output format
      job.setOutputFormatClass(TextOutputFormat.class);

      //Reducer K, V output
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(IntWritable.class);

      // set mapper/reducer; the reducer only sums, so it doubles as the combiner
      job.setMapperClass(MBAMapper.class);
      job.setCombinerClass(MBAReducer.class);
      job.setReducerClass(MBAReducer.class);

      //delete the output path if exists to avoid "existing dir/file" error;
      //resolve the file system from the path itself so a path on a
      //non-default file system is handled as well
      FileSystem outputFs = outputDir.getFileSystem(getConf());
      outputFs.delete(outputDir, true);

      long startTime = System.currentTimeMillis();
      boolean status = job.waitForCompletion(true);
      THE_LOGGER.info("job status="+status);
      long endTime = System.currentTimeMillis();
      long elapsedTime =  endTime - startTime;
      THE_LOGGER.info("Elapsed time: " + elapsedTime + " milliseconds");

      return status ? 0 : 1;
   }

}


结果:

[banana, bread] 1
[banana, crackers] 1
[bread, butter] 1
[bread, coffee] 1
[bread, coke] 1
[bread, crackers] 4
[butter, coffee] 1
[butter, coke] 3
[butter, crackers] 2
[coffee, coke] 1
[coffee, crackers] 2
[coke, crackers] 2




  • 1
    点赞
  • 21
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值