接《上》
一、MR步骤:
第一个MR 就是把输入数据的每个用户的信息整合下:
userid:1,vector:{103:2.5,102:3.0,101:5.0}
userid:2,vector:{104:2.0,103:5.0,102:2.5,101:2.0}
userid:3,vector:{107:5.0,105:4.5,104:4.0,101:2.5}
userid:4,vector:{106:4.0,104:4.5,103:3.0,101:5.0}
userid:5,vector:{106:4.0,105:3.5,104:4.0,103:2.0,102:3.0,101:4.0}
public void map(LongWritable key,Textvalue,Context context) throws IOException, InterruptedException{
VarLongWritable userID=newVarLongWritable();
LongWritable itemID=new LongWritable();
FloatWritable itemValue=newFloatWritable();
String line=value.toString();
String[]info=line.split(",");
if(info.length!=3){ return; } //uid,itemid,preference
userID.set(Long.parseLong(info[0]));
itemID.set(Long.parseLong(info[1]));
itemValue.set(Float.parseFloat(info[2]));
context.write(userID, newLongAndFloat(itemID,itemValue));
}
public class WiKiReducer1 extends Reducer<VarLongWritable,LongAndFloat,VarLongWritable,VectorWritable> {
public void reduce(VarLongWritable userID,Iterable<LongAndFloat> itemPrefs,Context context) throws IOException, InterruptedException{
// RandomAccessSparseVector(int cardinality, int initialCapacity)
Vector userVector=new RandomAccessSparseVector(Integer.MAX_VALUE,10);
for(LongAndFloat itemPref:itemPrefs){
userVector.set(Integer.parseInt(itemPref.getFirst().toString()),Float.parseFloat(itemPref.getSecond().toString()) );
}
context.write(userID, new VectorWritable(userVector));
// System.out.println("userid:"+userID+",vector:"+userVector);
}
类 LongAndFloat 用于把一对数据(LongWritable, FloatWritable)打包存储,并实现 Hadoop 的 WritableComparable 接口以便序列化和比较。
------------------------------------------------------------------------------------------------------------
/**
 * Writable pair of (LongWritable, FloatWritable); used as the map output
 * value of MR(1) to carry (itemID, preference) together.
 */
public class LongAndFloat implements WritableComparable<LongAndFloat> {
    private LongWritable first;
    private FloatWritable second;

    public LongAndFloat() {
        set(new LongWritable(), new FloatWritable());
    }

    public LongAndFloat(LongWritable l, FloatWritable f) {
        set(l, f);
    }

    public void set(LongWritable longWritable, FloatWritable floatWritable) {
        this.first = longWritable;
        this.second = floatWritable;
    }

    public LongWritable getFirst() {
        return first;
    }

    public FloatWritable getSecond() {
        return second;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // deserialize in the same order as write()
        first.readFields(in);
        second.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
    }

    @Override
    public int compareTo(LongAndFloat o) {
        int cmp = first.compareTo(o.first);
        return cmp != 0 ? cmp : second.compareTo(o.second);
    }

    // equals/hashCode kept consistent with compareTo so instances behave
    // correctly in hash-based partitioning and collections.
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof LongAndFloat)) {
            return false;
        }
        LongAndFloat other = (LongAndFloat) obj;
        return first.equals(other.first) && second.equals(other.second);
    }

    @Override
    public int hashCode() {
        return first.hashCode() * 163 + second.hashCode();
    }
}
二、第二个MR:
输入数据为MR(1)的输出。此处先不关心用户ID,直接把每个用户向量中的项目两两配对输出,用于统计项目之间的同现次数。
输出应该类似下面:
Item_id1:Item_id2 次数
101,{107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0}
102,{106:1.0,105:1.0,104:2.0,103:3.0,102:3.0,101:3.0}
1.main
/**
 * Driver for MR(2): reads the user vectors produced by MR(1) (SequenceFile)
 * and writes the item co-occurrence matrix (SequenceFile).
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (remaining.length != 2) {
        System.err.println("Usage: WiKiDriver2 <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "wiki job two");
    job.setJarByClass(WiKiDriver2.class);
    job.setNumReduceTasks(1);
    // mapper emits (itemIndex1, itemIndex2) pairs
    job.setMapperClass(WikiMapper2.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    // reducer folds the pairs into one co-occurrence vector per item
    job.setReducerClass(WiKiReducer2.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    // both input and output are SequenceFiles
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(PATH + remaining[0]));
    SequenceFileOutputFormat.setOutputPath(job, new Path(PATH + remaining[1]));
    if (!job.waitForCompletion(true)) {
        System.exit(1); // abort the pipeline on failure
    }
}
2.Mapper
publicclass WikiMapper2 extends Mapper<VarLongWritable,VectorWritable,IntWritable,IntWritable>{
public void map(VarLongWritableuserID,VectorWritable userVector,Context context) throws IOException,InterruptedException{
Iterator<Vector.Element>it=userVector.get().iterateNonZero();
while(it.hasNext()){
int index1=it.next().index();
// System.out.println("index1:"+index1);
Iterator<Vector.Element>it2=userVector.get().iterateNonZero();
while(it2.hasNext()){
intindex2=it2.next().index();
// test
/*if(index1==101){
System.out.println("index1:"+index1+",index2:"+index2);
}*/
context.write(newIntWritable(index1), new IntWritable(index2));
}
}
}
}
3.reduce
public class WiKiReducer2 extendsReducer<IntWritable,IntWritable,IntWritable,VectorWritable> {
public void reduce(IntWritable itemIndex1,Iterable<IntWritable>itemPrefs,Context context) throws IOException, InterruptedException{
// RandomAccessSparseVector(intcardinality, int initialCapacity)
Vector itemVector=newRandomAccessSparseVector(Integer.MAX_VALUE,10);
for(IntWritable itemPref:itemPrefs){
intitemIndex2=itemPref.get();
itemVector.set(itemIndex2,itemVector.get(itemIndex2)+1.0);
}
context.write(itemIndex1, new VectorWritable(itemVector));
// System.out.println(itemIndex1+","+itemVector);
}
}
三、第三个MR:
含有两个Mapper,第一个MR(31)把MR(2)的输出的格式转为VectorOrPrefWritable;
MR(2)的输出为物品同现矩阵,每行格式为 Item_id,{Item_id:同现次数,...},例如:
101,{107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0}
MR(32)针对MR(1)的输出把每一个项目ID和用户ID作为一对进行输出,输出格式也为VectorOrPrefWritable;
MR(1)的输出为用户评分向量,每行格式为 userid,{Item_id:评分,...},例如:
userid:5,vector:{106:4.0,105:3.5,104:4.0,103:2.0,102:3.0,101:4.0}
VectorOrPrefWritable
input: MR2的输出userVectors
map: 输出:(itemId,VectorOrPrefWritable<userId, pref>)
1.main
/**
 * Driver for MR(31): wraps each co-occurrence row from MR(2) into a
 * VectorOrPrefWritable so it can later be co-grouped with user preferences.
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: WiKiDriver31 <in><out>");
        System.exit(2);
    }
    Job job1 = new Job(conf1, "wiki job three1");
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setNumReduceTasks(1);
    job1.setJarByClass(WiKiDriver31.class);
    job1.setMapperClass(WikiMapper31.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    // an identity reducer is set only so the output goes through SequenceFileOutputFormat
    job1.setReducerClass(WiKiReducer31.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorOrPrefWritable.class);
    // this MR's input is MR(2)'s output
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[1]));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
}
2map
publicclass WikiMapper31 extends Mapper<IntWritable,VectorWritable,IntWritable,VectorOrPrefWritable>{
public void map(IntWritablekey,VectorWritable value,Context context) throws IOException, InterruptedException{
context.write(key, newVectorOrPrefWritable(value.get()));
// System.out.println("key"+key.toString()+",vlaue"+value.get());
}
}
3.reduce
/**
 * Identity reducer for MR(31): forwards every VectorOrPrefWritable
 * unchanged; it exists only so the job can write a SequenceFile.
 */
public class WiKiReducer31 extends Reducer<IntWritable, VectorOrPrefWritable, IntWritable, VectorOrPrefWritable> {
    public void reduce(IntWritable key, Iterable<VectorOrPrefWritable> values, Context context)
            throws IOException, InterruptedException {
        for (VectorOrPrefWritable wrapped : values) {
            context.write(key, wrapped);
        }
    }
}
四、第四个MR:
MR4的map不做任何事情;MR4的reduce输出就是把MR(31)和MR(32)的相同的itemID整合一下而已(注意此处的输入为两个路径):如下:
101 {107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0} [5 1 4 2 3] [4.0 5.0 5.0 2.0 2.5]
101共现矩阵
101{107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0}
101用户评分矩阵
[5 1 4 2 3] [4.0 5.0 5.0 2.0 2.5]
Item_id user_id:preference
101 2:2.0
101 5:4.0
101 4:5.0
101 3:2.0
101 1:5.0
1.main
/**
 * Driver for MR(4): reads both MR(31)'s and MR(32)'s output (note the two
 * input paths) and joins them per item into VectorAndPrefsWritable records.
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: WiKiDriver4 <in1><in2><out>");
        System.exit(2);
    }
    Job job1 = new Job(conf1, "wiki job four");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(WiKiDriver4.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper4.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    job1.setReducerClass(WiKiReducer4.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorAndPrefsWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    // two inputs: MR(31)'s output and MR(32)'s output
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[1]));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[2]));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
}
2.map
/**
 * Identity mapper for MR(4): passes each (itemID, VectorOrPrefWritable)
 * record straight through; the reducer does the co-grouping work.
 */
public class WikiMapper4 extends Mapper<IntWritable, VectorOrPrefWritable, IntWritable, VectorOrPrefWritable> {
    public void map(IntWritable itemID, VectorOrPrefWritable wrapped, Context context)
            throws IOException, InterruptedException {
        context.write(itemID, wrapped);
    }
}
3.reduce
public class WiKiReducer4 extendsReducer<IntWritable,VectorOrPrefWritable,IntWritable,VectorAndPrefsWritable>{
public void reduce(IntWritable key, Iterable<VectorOrPrefWritable>values,Context context) throws IOException, InterruptedException{
List<Long> userfs=new ArrayList<Long>(); //userID
List<Float> prefs=new ArrayList<Float>(); // pref
Vector v=null;
for(VectorOrPrefWritable value:values){
if(value.getVector()!=null){
v=value.getVector();
}else{
userfs.add(value.getUserID());
prefs.add(value.getValue());
}
}
context.write(key, new VectorAndPrefsWritable(v,userfs,prefs));
//System.out.println("key ,itemid:"+key.toString()+",information:"+v+","+userfs+","+prefs);
}
}
五、第五个MR:
map:针对MR4的输出的每一行中的每一个用户,用这个用户的评分值(value)去乘以项目之间的相似度向量,比如针对第一条记录中的用户3,则有 Vectorforuser3=[1.0 2.0 2.0 4.0 4.0 3.0 5.0]* 2.5 则map的输出为 key :3 value : Vectorforuser3;
map的输出应该如下所示:
alluserids:[5, 1, 4, 2, 3]
,userid:5,vector:{107:4.0,106:8.0,105:8.0,104:16.0,103:16.0,102:12.0,101:20.0}
,userid:1,vector:{107:5.0,106:10.0,105:10.0,104:20.0,103:20.0,102:15.0,101:25.0}
,userid:4,vector:{107:5.0,106:10.0,105:10.0,104:20.0,103:20.0,102:15.0,101:25.0}
,userid:2,vector:{107:2.0,106:4.0,105:4.0,104:8.0,103:8.0,102:6.0,101:10.0}
,userid:3,vector:{107:2.5,106:5.0,105:5.0,104:10.0,103:10.0,102:7.5,101:12.5}
Combine : 针对map的输出,把相同 key(即userID)的向量对应相加,得到的向量和即为该userID的对各个项目的评分;
combine的输出应该如下所示:
userid:1,vector:{107:5.0,106:18.0,105:15.5,104:33.5,103:39.0,102:31.5,101:44.0}
userid:2,vector:{107:4.0,106:20.5,105:15.5,104:36.0,103:41.5,102:32.5,101:45.5}
Reduce:针对combine的输出,把用户已经评价过分的项目筛选掉,然后按照评分值的大小有大到小排序输出,即为用户推荐项目;
最后的输出为:
1 [104:33.5,106:18.0,105:15.5,107:5.0]
2 [106:20.5,105:15.5,107:4.0]
3 [103:26.5,102:20.0,106:17.5]
4 [102:37.0,105:26.0,107:9.5]
5 [107:11.5]
1.main
/**
 * Driver for MR(5): turns the joined (co-occurrence, preference) records
 * into per-user recommendations; a combiner pre-sums the partial vectors.
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (remaining.length != 2) {
        System.err.println("Usage: WiKiDriver5 <in><out>");
        System.exit(2);
    }
    Job job = new Job(conf, "wiki job five");
    job.setJarByClass(WiKiDriver5.class);
    job.setNumReduceTasks(1);
    job.setMapperClass(WikiMapper5.class);
    job.setCombinerClass(WiKiCombiner5.class);
    job.setReducerClass(WiKiReducer5.class);
    job.setMapOutputKeyClass(VarLongWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(VarLongWritable.class);
    job.setOutputValueClass(RecommendedItemsWritable.class);
    // input is a SequenceFile; the final output stays plain text
    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(PATH + remaining[0]));
    FileOutputFormat.setOutputPath(job, new Path(PATH + remaining[1]));
    if (!job.waitForCompletion(true)) {
        System.exit(1); // abort on failure
    }
}
}
2.Map
publicclass WikiMapper5 extends Mapper<IntWritable,VectorAndPrefsWritable,VarLongWritable,VectorWritable>{
public void map(IntWritablekey,VectorAndPrefsWritable vectorAndPref,Context context) throws IOException,InterruptedException{
Vector coo=vectorAndPref.getVector();
List<Long>userIds=vectorAndPref.getUserIDs();
List<Float>prefValues=vectorAndPref.getValues();
//System.out.println("alluserids:"+userIds);
for(inti=0;i<userIds.size();i++){
long userID=userIds.get(i);
floatprefValue=prefValues.get(i);
Vectorpar=coo.times(prefValue);
context.write(newVarLongWritable(userID), new VectorWritable(par));
//System.out.println(",userid:"+userID+",vector:"+par); // ifthe user id = 3 is the same as my paper then is right
}
}
}
3.combine
publicclass WiKiCombiner5 extendsReducer<VarLongWritable,VectorWritable,VarLongWritable,VectorWritable>{
public void reduce(VarLongWritable key,Iterable<VectorWritable> values,Context context) throws IOException,InterruptedException{
Vector partial=null;
for(VectorWritable v:values){
partial=partial==null?v.get():partial.plus(v.get());
}
context.write(key, newVectorWritable(partial));
System.out.println("userid:"+key.toString()+",vecotr:"+partial);// here also should be the same as my paper'sresult
}
}
4.reduce
publicclass WiKiReducer5 extendsReducer<VarLongWritable,VectorWritable,VarLongWritable,RecommendedItemsWritable>{
private intrecommendationsPerUser=RECOMMENDATIONSPERUSER;
private String path=JOB1OUTPATH;
private staticFastMap<Integer,String> map=new FastMap<Integer,String>();
public void setup(Context context) throwsIOException{
Configuration conf=newConfiguration();
FileSystemfs=FileSystem.get(URI.create(path), conf);
Path tempPath=new Path(path);
SequenceFile.Reader reader=null;
try {
reader=new SequenceFile.Reader(fs,tempPath, conf);
Writablekey=(Writable)ReflectionUtils.newInstance(reader.getKeyClass(),conf);
Writable value = (Writable)ReflectionUtils.newInstance(reader.getValueClass(), conf);
// long position = reader.getPosition();
while (reader.next(key, value)){
map.put(Integer.parseInt(key.toString()), value.toString());
// System.out.println(key.toString()+","+value.toString());
// position = reader.getPosition(); //beginning of next record
}
} catch (Exception e) {
// TODO Auto-generated catchblock
e.printStackTrace();
}
}
public void reduce(VarLongWritable key,Iterable<VectorWritable> values,Context context) throws IOException,InterruptedException{
int userID=(int)key.get();
Vector rev=null;
for(VectorWritable vec:values){
rev=rev==null?vec.get():rev.plus(vec.get());
}
Queue<RecommendedItem>topItems=new PriorityQueue<RecommendedItem>(
recommendationsPerUser+1,
Collections.reverseOrder(ByValueRecommendedItemComparator.getInstance())
);
Iterator<Vector.Element>recommendationVectorIterator=
rev.iterateNonZero();
while(recommendationVectorIterator.hasNext()){
Vector.Elemente=recommendationVectorIterator.next();
int index=e.index();
System.out.println("Vecotr.element.indxe:"+index); // test here find the index is itemid or not ** test result : index isitem
if(!hasItem(userID,String.valueOf(index))){
float value=(float)e.get();
if(topItems.size()<recommendationsPerUser){
// here only set index
topItems.add(newGenericRecommendedItem(index,value));
}elseif(value>topItems.peek().getValue()){
topItems.add(newGenericRecommendedItem(index,value));
topItems.poll();
}
}
}
List<RecommendedItem>recom=newArrayList<RecommendedItem>(topItems.size());
recom.addAll(topItems);
Collections.sort(recom,ByValueRecommendedItemComparator.getInstance());
context.write(key, new RecommendedItemsWritable(recom));
}
public static boolean hasItem(int user,String item){ // to check whether the user has rate theitem
boolean flag=false;
String items=map.get(user);
if(items.contains(item)){
flag=true;
}
return flag;
}
}
最后一个reducer的编写基本思路:在Reducer的setup函数中读取SequenceFile的数据,这个数据是MR1的输出数据,用来排除用户已经评价过的项目。
在最后一个Reducer中我也用了一个FastMap,这个类是Mahout的,应该用他提供的一些类会运行的更加快吧。