Collaborative Filtering Algorithm (Part 2)

Continued from Part 1.

I. MR steps:

The first MR simply consolidates each user's records in the input data into one vector per user:

userid:1,vector:{103:2.5,102:3.0,101:5.0} 

userid:2,vector:{104:2.0,103:5.0,102:2.5,101:2.0} 

userid:3,vector:{107:5.0,105:4.5,104:4.0,101:2.5} 

userid:4,vector:{106:4.0,104:4.5,103:3.0,101:5.0} 

userid:5,vector:{106:4.0,105:3.5,104:4.0,103:2.0,102:3.0,101:4.0} 
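
For reference, the raw input that produces these vectors is a plain text file of uid,itemid,preference triples, one per line (reconstructed here from the vectors above):

1,101,5.0
1,102,3.0
1,103,2.5
2,101,2.0
2,102,2.5
2,103,5.0
2,104,2.0
3,101,2.5
3,104,4.0
3,105,4.5
3,107,5.0
4,101,5.0
4,103,3.0
4,104,4.5
4,106,4.0
5,101,4.0
5,102,3.0
5,103,2.0
5,104,4.0
5,105,3.5
5,106,4.0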

 

// The enclosing class declaration is missing from the original post; the name
// WikiMapper1 is an assumption, by analogy with WiKiReducer1 below.
public class WikiMapper1 extends Mapper<LongWritable,Text,VarLongWritable,LongAndFloat> {

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        VarLongWritable userID = new VarLongWritable();
        LongWritable itemID = new LongWritable();
        FloatWritable itemValue = new FloatWritable();
        String line = value.toString();
        String[] info = line.split(",");
        if (info.length != 3) {   // each input line is uid,itemid,preference
            return;
        }
        userID.set(Long.parseLong(info[0]));
        itemID.set(Long.parseLong(info[1]));
        itemValue.set(Float.parseFloat(info[2]));
        context.write(userID, new LongAndFloat(itemID, itemValue));
    }
}

 

public class WiKiReducer1 extends Reducer<VarLongWritable,LongAndFloat,VarLongWritable,VectorWritable> {

    public void reduce(VarLongWritable userID, Iterable<LongAndFloat> itemPrefs, Context context) throws IOException, InterruptedException {
        // RandomAccessSparseVector(int cardinality, int initialCapacity)
        Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 10);
        for (LongAndFloat itemPref : itemPrefs) {
            userVector.set(Integer.parseInt(itemPref.getFirst().toString()),
                    Float.parseFloat(itemPref.getSecond().toString()));
        }
        context.write(userID, new VectorWritable(userVector));
        // System.out.println("userid:" + userID + ",vector:" + userVector);
    }
}

 

The class LongAndFloat stores an (itemID, preference) pair as a Writable data type:

------------------------------------------------------------------------------------------------------------

public class LongAndFloat implements WritableComparable<LongAndFloat> {

    private LongWritable first;    // itemID
    private FloatWritable second;  // preference

    public LongAndFloat() {
        set(new LongWritable(), new FloatWritable());
    }

    public LongAndFloat(LongWritable l, FloatWritable f) {
        set(l, f);
    }

    public void set(LongWritable longWritable, FloatWritable floatWritable) {
        this.first = longWritable;
        this.second = floatWritable;
    }

    public LongWritable getFirst() {
        return first;
    }

    public FloatWritable getSecond() {
        return second;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        first.readFields(in);
        second.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
    }

    @Override
    public int compareTo(LongAndFloat o) {
        int cmp = first.compareTo(o.first);
        if (cmp != 0) {
            return cmp;
        }
        return second.compareTo(o.second);
    }
}
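
The driver for this first job is not shown in the original post. Below is a minimal sketch written by analogy with the later drivers; the class name WiKiDriver1, the job name, and the PATH constant are assumptions:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.VectorWritable;

public class WiKiDriver1 {

    // HDFS path prefix, as in the later drivers (its value is not shown in this post)
    private static final String PATH = "";

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf1 = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: WiKiDriver1 <in> <out>");
            System.exit(2);
        }
        Job job1 = new Job(conf1, "wiki job one");
        job1.setNumReduceTasks(1);
        job1.setJarByClass(WiKiDriver1.class);
        job1.setInputFormatClass(TextInputFormat.class);           // raw uid,itemid,pref text
        job1.setMapperClass(WikiMapper1.class);
        job1.setMapOutputKeyClass(VarLongWritable.class);
        job1.setMapOutputValueClass(LongAndFloat.class);
        job1.setReducerClass(WiKiReducer1.class);
        job1.setOutputKeyClass(VarLongWritable.class);
        job1.setOutputValueClass(VectorWritable.class);
        job1.setOutputFormatClass(SequenceFileOutputFormat.class); // MR2 expects SequenceFile input
        FileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
        SequenceFileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[1]));
        if (!job1.waitForCompletion(true)) {
            System.exit(1); // run error then exit
        }
    }
}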

 

 

II. The second MR:

The input is MR1's output. This job builds the item-item co-occurrence counts, which serve as the item similarities: ignoring the user ID, the mapper splits each user's vector into all ordered pairs of items, and the reducer counts how often each pair occurs. The output (format: Item_id1  {Item_id2:count,...}) should look like the following; for example, row 101 contains 101:5.0 because all five users rated item 101, and 104:4.0 because items 101 and 104 co-occur in the vectors of users 2, 3, 4 and 5:

101,{107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0} 

102,{106:1.0,105:1.0,104:2.0,103:3.0,102:3.0,101:3.0} 

 

1.main

public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: WiKiDriver2 <in> <out>");
        System.exit(2);
    }
    Job job1 = new Job(conf1, "wiki job two");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(WiKiDriver2.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper2.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(IntWritable.class);
    job1.setReducerClass(WiKiReducer2.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[1]));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}

 

2.Mapper

public class WikiMapper2 extends Mapper<VarLongWritable,VectorWritable,IntWritable,IntWritable> {

    public void map(VarLongWritable userID, VectorWritable userVector, Context context) throws IOException, InterruptedException {
        Iterator<Vector.Element> it = userVector.get().iterateNonZero();
        while (it.hasNext()) {
            int index1 = it.next().index();
            // System.out.println("index1:" + index1);
            Iterator<Vector.Element> it2 = userVector.get().iterateNonZero();
            while (it2.hasNext()) {
                int index2 = it2.next().index();
                // test
                /* if (index1 == 101) {
                    System.out.println("index1:" + index1 + ",index2:" + index2);
                } */
                // emit every ordered pair of items rated by this user (including index1 == index2)
                context.write(new IntWritable(index1), new IntWritable(index2));
            }
        }
    }
}

3.reduce

public class WiKiReducer2 extends Reducer<IntWritable,IntWritable,IntWritable,VectorWritable> {

    public void reduce(IntWritable itemIndex1, Iterable<IntWritable> itemPrefs, Context context) throws IOException, InterruptedException {
        // RandomAccessSparseVector(int cardinality, int initialCapacity)
        Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 10);
        for (IntWritable itemPref : itemPrefs) {
            int itemIndex2 = itemPref.get();
            itemVector.set(itemIndex2, itemVector.get(itemIndex2) + 1.0);  // count co-occurrences
        }
        context.write(itemIndex1, new VectorWritable(itemVector));
        // System.out.println(itemIndex1 + "," + itemVector);
    }
}

 

III. The third MR:

This step has two mapper jobs. The first, MR(31), converts the output of MR(2), i.e. the item co-occurrence matrix (Item_id1  {Item_id2:count,...}), into VectorOrPrefWritable format:

101,{107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0}

The second, MR(32), takes the output of MR(1), i.e. the user rating vectors,

userid:5,vector:{106:4.0,105:3.5,104:4.0,103:2.0,102:3.0,101:4.0}

and emits one record per (item ID, user ID) pair, also in VectorOrPrefWritable format.

In short: MR(31)'s map wraps each co-occurrence row as (itemId, VectorOrPrefWritable<vector>), while MR(32)'s map takes MR1's userVectors as input and outputs (itemId, VectorOrPrefWritable<userId, pref>). A sketch of MR(32)'s mapper is given after the MR(31) code below.

1.main

public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: WiKiDriver31 <in> <out>");
        System.exit(2);
    }
    Job job1 = new Job(conf1, "wiki job three1");
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setNumReduceTasks(1);
    job1.setJarByClass(WiKiDriver31.class);
    job1.setMapperClass(WikiMapper31.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    // set a reducer only to use SequenceFileOutputFormat
    job1.setReducerClass(WiKiReducer31.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorOrPrefWritable.class);
    // this MR's input is MR2's output
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[1]));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}

 

2.map

public class WikiMapper31 extends Mapper<IntWritable,VectorWritable,IntWritable,VectorOrPrefWritable> {

    public void map(IntWritable key, VectorWritable value, Context context) throws IOException, InterruptedException {
        context.write(key, new VectorOrPrefWritable(value.get()));
        // System.out.println("key:" + key.toString() + ",value:" + value.get());
    }
}

 

3.reduce

public class WiKiReducer31 extends Reducer<IntWritable,VectorOrPrefWritable,IntWritable,VectorOrPrefWritable> {

    public void reduce(IntWritable key, Iterable<VectorOrPrefWritable> values, Context context) throws IOException, InterruptedException {
        for (VectorOrPrefWritable va : values) {
            context.write(key, va);
        }
    }
}
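
The code for MR(32) is not shown in the original post. Below is a minimal sketch of its mapper under the same naming conventions (the class name WikiMapper32 is an assumption); its driver would mirror WiKiDriver31, with the mapper class swapped and MR1's output as the input path:

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.cf.taste.hadoop.item.VectorOrPrefWritable;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class WikiMapper32 extends Mapper<VarLongWritable,VectorWritable,IntWritable,VectorOrPrefWritable> {

    public void map(VarLongWritable userID, VectorWritable userVector, Context context) throws IOException, InterruptedException {
        long uid = userID.get();
        // split the user's rating vector into one (itemId, <userId, pref>) record per item
        Iterator<Vector.Element> it = userVector.get().iterateNonZero();
        while (it.hasNext()) {
            Vector.Element e = it.next();
            context.write(new IntWritable(e.index()), new VectorOrPrefWritable(uid, (float) e.get()));
        }
    }
}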

IV. The fourth MR:

MR4's map does nothing; MR4's reduce simply merges the MR(31) and MR(32) records that share the same itemID (note that this job takes two input paths). The result looks like:

101 {107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0} [5 1 4 2 3] [4.0 5.0 5.0 2.0 2.5]

Here the first part is item 101's co-occurrence row:

101 {107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0}

and the second part, the user rating lists

[5 1 4 2 3] [4.0 5.0 5.0 2.0 2.5]

corresponds to these (Item_id   user_id:preference) records from MR(32):

101      2:2.0

101      5:4.0

101      4:5.0

101      3:2.5

101      1:5.0

1.main

public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: WiKiDriver4 <in1> <in2> <out>");
        System.exit(2);
    }
    Job job1 = new Job(conf1, "wiki job four");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(WiKiDriver4.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper4.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    job1.setReducerClass(WiKiReducer4.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorAndPrefsWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    // two input paths: MR(31)'s output and MR(32)'s output
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[1]));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[2]));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}

 

2.map

public class WikiMapper4 extends Mapper<IntWritable,VectorOrPrefWritable,IntWritable,VectorOrPrefWritable> {

    // identity map: just pass each record through to the reducer
    public void map(IntWritable key, VectorOrPrefWritable value, Context context) throws IOException, InterruptedException {
        context.write(key, value);
    }
}

 

3.reduce

public class WiKiReducer4 extends Reducer<IntWritable,VectorOrPrefWritable,IntWritable,VectorAndPrefsWritable> {

    public void reduce(IntWritable key, Iterable<VectorOrPrefWritable> values, Context context) throws IOException, InterruptedException {
        List<Long> userIDs = new ArrayList<Long>();   // user IDs
        List<Float> prefs = new ArrayList<Float>();   // preferences
        Vector v = null;
        for (VectorOrPrefWritable value : values) {
            if (value.getVector() != null) {
                v = value.getVector();                // the co-occurrence row from MR(31)
            } else {
                userIDs.add(value.getUserID());       // a (user, pref) record from MR(32)
                prefs.add(value.getValue());
            }
        }
        context.write(key, new VectorAndPrefsWritable(v, userIDs, prefs));
        // System.out.println("key,itemid:" + key.toString() + ",information:" + v + "," + userIDs + "," + prefs);
    }
}

 

 

V. The fifth MR:

map: for each user in each line of MR4's output, multiply that user's preference value by the item's co-occurrence vector. For example, for user 3 in the first record above, Vectorforuser3 = [1.0 2.0 2.0 4.0 4.0 3.0 5.0] * 2.5, and the map emits key: 3, value: Vectorforuser3.

The map output should look like this:

alluserids:[5, 1, 4, 2, 3]  

,userid:5,vector:{107:4.0,106:8.0,105:8.0,104:16.0,103:16.0,102:12.0,101:20.0}  

,userid:1,vector:{107:5.0,106:10.0,105:10.0,104:20.0,103:20.0,102:15.0,101:25.0}  

,userid:4,vector:{107:5.0,106:10.0,105:10.0,104:20.0,103:20.0,102:15.0,101:25.0}  

,userid:2,vector:{107:2.0,106:4.0,105:4.0,104:8.0,103:8.0,102:6.0,101:10.0}  

,userid:3,vector:{107:2.5,106:5.0,105:5.0,104:10.0,103:10.0,102:7.5,101:12.5}  

 

Combine: for map outputs that share a key (i.e. a userID), add the corresponding vectors element-wise; the summed vector holds that user's predicted score for every item.

The combine output should look like this:

userid:1,vector:{107:5.0,106:18.0,105:15.5,104:33.5,103:39.0,102:31.5,101:44.0}

userid:2,vector:{107:4.0,106:20.5,105:15.5,104:36.0,103:41.5,102:32.5,101:45.5}

Reduce: filter out the items the user has already rated, then sort the remaining items by predicted score in descending order; these are the recommendations for the user.

The final output is:

1  [104:33.5,106:18.0,105:15.5,107:5.0] 

2  [106:20.5,105:15.5,107:4.0] 

3  [103:26.5,102:20.0,106:17.5] 

4  [102:37.0,105:26.0,107:9.5] 

5  [107:11.5] 
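
As a sanity check, user 1's predicted score for item 104 can be computed by hand from the matrices above: cooc(104,101)*5.0 + cooc(104,102)*3.0 + cooc(104,103)*2.5 = 4*5.0 + 2*3.0 + 3*2.5 = 33.5, which matches the 104:33.5 entry in the first output line.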

 

1.main

public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: WiKiDriver5 <in> <out>");
        System.exit(2);
    }
    Job job1 = new Job(conf1, "wiki job five");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(WiKiDriver5.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper5.class);
    job1.setMapOutputKeyClass(VarLongWritable.class);
    job1.setMapOutputValueClass(VectorWritable.class);
    job1.setCombinerClass(WiKiCombiner5.class);
    job1.setReducerClass(WiKiReducer5.class);
    job1.setOutputKeyClass(VarLongWritable.class);
    job1.setOutputValueClass(RecommendedItemsWritable.class);
    // job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
    FileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[1]));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}

 

2.Map

public class WikiMapper5 extends Mapper<IntWritable,VectorAndPrefsWritable,VarLongWritable,VectorWritable> {

    public void map(IntWritable key, VectorAndPrefsWritable vectorAndPref, Context context) throws IOException, InterruptedException {
        Vector coo = vectorAndPref.getVector();            // co-occurrence row for this item
        List<Long> userIds = vectorAndPref.getUserIDs();
        List<Float> prefValues = vectorAndPref.getValues();
        // System.out.println("alluserids:" + userIds);
        for (int i = 0; i < userIds.size(); i++) {
            long userID = userIds.get(i);
            float prefValue = prefValues.get(i);
            Vector par = coo.times(prefValue);             // partial score vector for this user
            context.write(new VarLongWritable(userID), new VectorWritable(par));
            // System.out.println(",userid:" + userID + ",vector:" + par);  // user 3 should match the worked example above
        }
    }
}

3.combine

public class WiKiCombiner5 extends Reducer<VarLongWritable,VectorWritable,VarLongWritable,VectorWritable> {

    public void reduce(VarLongWritable key, Iterable<VectorWritable> values, Context context) throws IOException, InterruptedException {
        Vector partial = null;
        for (VectorWritable v : values) {
            partial = partial == null ? v.get() : partial.plus(v.get());   // element-wise vector sum
        }
        context.write(key, new VectorWritable(partial));
        System.out.println("userid:" + key.toString() + ",vector:" + partial);  // should match the combine output shown above
    }
}

4.reduce

public class WiKiReducer5 extends Reducer<VarLongWritable,VectorWritable,VarLongWritable,RecommendedItemsWritable> {

    private int recommendationsPerUser = RECOMMENDATIONSPERUSER;
    private String path = JOB1OUTPATH;
    private static FastMap<Integer,String> map = new FastMap<Integer,String>();

    // Cache MR1's output (userID -> serialized rating vector) so that reduce()
    // can filter out items the user has already rated.
    public void setup(Context context) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(path), conf);
        Path tempPath = new Path(path);
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, tempPath, conf);
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            while (reader.next(key, value)) {
                map.put(Integer.parseInt(key.toString()), value.toString());
                // System.out.println(key.toString() + "," + value.toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            IOUtils.closeStream(reader);   // always release the reader
        }
    }

 

    public void reduce(VarLongWritable key, Iterable<VectorWritable> values, Context context) throws IOException, InterruptedException {
        int userID = (int) key.get();
        Vector rev = null;
        for (VectorWritable vec : values) {
            rev = rev == null ? vec.get() : rev.plus(vec.get());
        }
        // keep the top N items in a priority queue whose head is the weakest candidate
        Queue<RecommendedItem> topItems = new PriorityQueue<RecommendedItem>(
                recommendationsPerUser + 1,
                Collections.reverseOrder(ByValueRecommendedItemComparator.getInstance()));
        Iterator<Vector.Element> recommendationVectorIterator = rev.iterateNonZero();
        while (recommendationVectorIterator.hasNext()) {
            Vector.Element e = recommendationVectorIterator.next();
            int index = e.index();
            // System.out.println("Vector.element.index:" + index);  // test: the index is the item ID
            if (!hasItem(userID, String.valueOf(index))) {   // skip items the user has already rated
                float value = (float) e.get();
                if (topItems.size() < recommendationsPerUser) {
                    // here only set index
                    topItems.add(new GenericRecommendedItem(index, value));
                } else if (value > topItems.peek().getValue()) {
                    topItems.add(new GenericRecommendedItem(index, value));
                    topItems.poll();
                }
            }
        }
        List<RecommendedItem> recom = new ArrayList<RecommendedItem>(topItems.size());
        recom.addAll(topItems);
        Collections.sort(recom, ByValueRecommendedItemComparator.getInstance());
        context.write(key, new RecommendedItemsWritable(recom));
    }

     

    // check whether the user has already rated this item
    public static boolean hasItem(int user, String item) {
        String items = map.get(user);   // the user's rating vector from MR1, serialized as a string
        // Note: contains() is a substring match on that string, so in general it can
        // produce false positives (e.g. "10" would match "101"); it suffices for the
        // small item IDs used in this example.
        return items.contains(item);
    }
}

 

The basic idea behind this last reducer: its setup() function reads the SequenceFile data produced by MR1, which is then used to exclude the items a user has already rated from the recommendations.

The last reducer also uses a FastMap, a Mahout class; presumably using the classes Mahout provides makes it run a bit faster.

