《Data Algorithm》读书笔记七 — 购物篮分析
1. 购物篮分析(Market Basket Analysis,MBA)简介
购物篮分析 可以帮助我们找出很有可能会一起购买的商品,关联规则挖掘 会发现一个交易清单中商品之间的相关性。为购物篮分析挖掘关联规则时,要找出频繁商品集,这是一个计算密集型问题,非常适合使用MapReduce
解决。
2. 分析方法
查找购物篮中最常出现的 N
阶商品 TupleN(N =1,2,3,4...)
一旦得到最频繁项集 Fi(i=1,2,...)
可以用它们生成交易的一个关联规则
商品:{A,B,C,D,E}
T1:A,C
T2:B,D
T3:A,C,E
T4:C,E
T5:A,B,E
T6:B,E
F1 ={[C,3], [A,3] , [B,3] , [E,4]}
F2 ={[<A,C>,2], [<C,E>,2], [<A,E>,2], [<B,E>,2] }
例如,有交易:T1:crackers,icecream,coke,apple
,我们可以得到如下的 二元组:
<crackers,icecream>
<crackers,coke>
<crackers,apple>
<icecream,coke>
<icecream,apple>
<coke,apple>
得到如下的三元组:
<crackers,icecream,coke>
<crackers,icecream,apple>
<crackers,coke,apple>
<icecream,coke,apple>
注意:<coke,apple> 和 <apple,coke> 实质相同,但在这里却会被当作不同的键。避免这个问题的方法是:先对交易中的商品按字母顺序排序,再生成键值对。
3.需求
给出一个交易日志,求出其中所有两两商品组合各自出现的频率。
3.1 测试输入
T1:crackers,bread,banana
T2:crackers,coke,butter,coffee
T3:crackers,bread
T4:crackers,bread
T5:crackers,bread,coffee
T6:butter,coke
T7:butter,coke,bread,crackers
3.2 测试输出
[bread,banana] 1
[butter,bread] 1
[coffee,bread] 1
[coffee,butter] 1
[coke,bread] 1
[coke,butter] 3
[coke,coffee] 1
[crackers,banana] 1
[crackers,bread] 5
[crackers,butter] 2
[crackers,coffee] 2
[crackers,coke] 2
4.实现思路
- step 1:根据得到的交易清单,将每行的物品按照分隔符分割,再按字母顺序排序,接着将排序结果两两组合,生成 combineKey。
- step 2:Mapper 发出的键值对是 <Text,IntWritable>,在 Reducer 中对其进行一个简单的累加即可。
5.实现代码
5.1 MBAJobDriver
package data_algorithm.chapter_7;
import data_algorithm.utils.HdfsUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MBAJobDriver extends Configured implements Tool {

    /**
     * Configures and submits the Market-Basket-Analysis MapReduce job.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     * @return 0 if the job completed successfully, 1 otherwise
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Delete a stale output directory up front so the job does not
        // fail on startup with "output path already exists".
        HdfsUtils.deletePath(args[1]);

        // Use the configuration injected by ToolRunner instead of a fresh
        // Configuration, so generic command-line options (-D, -conf, ...)
        // actually take effect.
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "MarketBasketAnalysis");

        job.setJarByClass(MBAJobDriver.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MBAMapper.class);
        job.setReducerClass(MBAReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // The reducer emits <Text, IntWritable> as well; declare it explicitly.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: MBAJobDriver <input path> <output path>");
            System.exit(1);
        }
        // Propagate the job's exit status to the shell instead of
        // discarding ToolRunner's return value.
        System.exit(ToolRunner.run(new MBAJobDriver(), args));
    }
}
5.2 MBAMapper
package data_algorithm.chapter_7;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.Arrays;
public class MBAMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused output objects — avoids one allocation per emitted pair,
    // the standard Hadoop mapper idiom.
    private static final IntWritable ONE = new IntWritable(1);
    private final Text pairKey = new Text();

    /**
     * Emits every unordered item pair of one transaction with count 1.
     *
     * Input line format: "T1:crackers,icecream,coke,apple".
     * Items are sorted alphabetically first, and each pair is emitted as
     * "[later,earlier]", so <A,B> and <B,A> always map to the same key.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // Strip the transaction id explicitly. The original code sorted the
        // id together with the items and relied on uppercase "Tn" sorting
        // before lowercase item names — fragile for any other item casing.
        int colon = line.indexOf(':');
        String itemsPart = colon >= 0 ? line.substring(colon + 1) : line;

        String[] items = itemsPart.split(",");
        Arrays.sort(items); // alphabetical order normalizes pair keys

        for (int i = 1; i < items.length; i++) {
            for (int j = 0; j < i; j++) {
                pairKey.set("[" + items[i] + "," + items[j] + "]");
                context.write(pairKey, ONE);
            }
        }
    }

    /** Ad-hoc manual check of the item-splitting logic; not used by the job. */
    public static void main(String[] args) {
        String value = "T1:crackers,icecream,coke,apple";
        String[] items = value.substring(value.indexOf(':') + 1).split(",");
        Arrays.sort(items);
        System.out.println(String.join(" ", items));
    }
}
5.3 MBAReducer
package data_algorithm.chapter_7;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MBAReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Sums the per-transaction counts emitted by the mapper for one item
     * pair and writes the pair together with its total frequency.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable count : values) {
            total += count.get();
        }
        context.write(key, new IntWritable(total));
    }
}