接《上》
一、MR步骤:
第一个MR 就是把输入数据的每个用户的信息整合下:
userid:1,vector:{103:2.5,102:3.0,101:5.0}
userid:2,vector:{104:2.0,103:5.0,102:2.5,101:2.0}
userid:3,vector:{107:5.0,105:4.5,104:4.0,101:2.5}
userid:4,vector:{106:4.0,104:4.5,103:3.0,101:5.0}
userid:5,vector:{106:4.0,105:3.5,104:4.0,103:2.0,102:3.0,101:4.0}
public void map(LongWritable key,Textvalue,Context context) throws IOException, InterruptedException{
VarLongWritable userID=newVarLongWritable();
LongWritable itemID=new LongWritable();
FloatWritable itemValue=newFloatWritable();
String line=value.toString();
String[]info=line.split(",");
if(info.length!=3){ return; } //uid,itemid,preference
userID.set(Long.parseLong(info[0]));
itemID.set(Long.parseLong(info[1]));
itemValue.set(Float.parseFloat(info[2]));
context.write(userID, newLongAndFloat(itemID,itemValue));
}
public class WiKiReducer1 extends Reducer<VarLongWritable,LongAndFloat,VarLongWritable,VectorWritable> {
public void reduce(VarLongWritable userID,Iterable<LongAndFloat> itemPrefs,Context context) throws IOException, InterruptedException{
// RandomAccessSparseVector(int cardinality, int initialCapacity)
Vector userVector=new RandomAccessSparseVector(Integer.MAX_VALUE,10);
for(LongAndFloat itemPref:itemPrefs){
userVector.set(Integer.parseInt(itemPref.getFirst().toString()),Float.parseFloat(itemPref.getSecond().toString()) );
}
context.write(userID, new VectorWritable(userVector));
// System.out.println("userid:"+userID+",vector:"+userVector);
}
类 LongAndFloat 用于把一对数据(LongWritable, FloatWritable)打包存储,并实现 Hadoop 的 WritableComparable 接口以便序列化和比较。
------------------------------------------------------------------------------------------------------------
/**
 * Writable pair of (LongWritable, FloatWritable); used as the map output
 * value of MR(1) to carry (itemID, preference) together.
 */
public class LongAndFloat implements WritableComparable<LongAndFloat> {
    private LongWritable first;
    private FloatWritable second;

    public LongAndFloat() {
        set(new LongWritable(), new FloatWritable());
    }

    public LongAndFloat(LongWritable l, FloatWritable f) {
        set(l, f);
    }

    public void set(LongWritable longWritable, FloatWritable floatWritable) {
        this.first = longWritable;
        this.second = floatWritable;
    }

    public LongWritable getFirst() {
        return first;
    }

    public FloatWritable getSecond() {
        return second;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // deserialize in the same order as write()
        first.readFields(in);
        second.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
    }

    @Override
    public int compareTo(LongAndFloat o) {
        int cmp = first.compareTo(o.first);
        return cmp != 0 ? cmp : second.compareTo(o.second);
    }

    // equals/hashCode kept consistent with compareTo so instances behave
    // correctly in hash-based partitioning and collections.
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof LongAndFloat)) {
            return false;
        }
        LongAndFloat other = (LongAndFloat) obj;
        return first.equals(other.first) && second.equals(other.second);
    }

    @Override
    public int hashCode() {
        return first.hashCode() * 163 + second.hashCode();
    }
}
二、第二个MR:
输入数据为MR(1)的输出。此处先不关心用户ID,直接把每个用户向量中的项目两两配对输出,用于统计项目之间的同现次数。
输出应该类似下面:
Item_id1:Item_id2 次数
101,{107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0}
102,{106:1.0,105:1.0,104:2.0,103:3.0,102:3.0,101:3.0}
1.main
/**
 * Driver for MR(2): reads the user vectors produced by MR(1) (SequenceFile)
 * and writes the item co-occurrence matrix (SequenceFile).
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (remaining.length != 2) {
        System.err.println("Usage: WiKiDriver2 <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "wiki job two");
    job.setJarByClass(WiKiDriver2.class);
    job.setNumReduceTasks(1);
    // mapper emits (itemIndex1, itemIndex2) pairs
    job.setMapperClass(WikiMapper2.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    // reducer folds the pairs into one co-occurrence vector per item
    job.setReducerClass(WiKiReducer2.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    // both input and output are SequenceFiles
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(PATH + remaining[0]));
    SequenceFileOutputFormat.setOutputPath(job, new Path(PATH + remaining[1]));
    if (!job.waitForCompletion(true)) {
        System.exit(1); // abort the pipeline on failure
    }
}
2.Mapper
publicclass WikiMapper2 extends Mapper<VarLongWritable,VectorWritable,IntWritable,IntWritable>{
public void map(VarLongWritableuserID,VectorWritable userVector,Context context) throws IOException,InterruptedException{
Iterator<Vector.Element>it=userVector.get().iterateNonZero();
while(it.hasNext()){
int index1=it.next().index();
// System.out.println("index1:"+index1);
Iterator<Vector.Element>it2=userVector.get().iterateNonZero();
while(it2.hasNext()){
intindex2=it2.next().index();
// test
/*if(index1==101){
System.out.println("index1:"+index1+",index2:"+index2);
}*/
context.write(newIntWritable(index1), new IntWritable(index2));
}
}
}
}
3.reduce
public class WiKiReducer2 extendsReducer<IntWritable,IntWritable,IntWritable,VectorWritable> {
public void reduce(IntWritable itemIndex1,Iterable<IntWritable>itemPrefs,Context context) throws IOException, InterruptedException{
// RandomAccessSparseVector(intcardinality, int initialCapacity)
Vector itemVector=newRandomAccessSparseVector(Integer.MAX_VALUE,10);
for(IntWritable itemPref:itemPrefs){
intitemIndex2=itemPref.get();
itemVector.set(itemIndex2,itemVector.get(itemIndex2)+1.0);
}
context.write(itemIndex1, new VectorWritable(itemVector));
// System.out.println(itemIndex1+","+itemVector);
}
}
三、第三个MR:
含有两个Mapper,第一个MR(31)把MR(2)的输出的格式转为VectorOrPrefWritable;
MR(2)的输出为物品同现矩阵,每行格式为 Item_id,{Item_id:同现次数,...},例如:
101,{107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0}
MR(32)针对MR(1)的输出把每一个项目ID和用户ID作为一对进行输出,输出格式也为VectorOrPrefWritable;
MR(1)的输出为用户评分向量,每行格式为 userid,{Item_id:评分,...},例如:
userid:5,vector:{106:4.0,105:3.5,104:4.0,103:2.0,102:3.0,101:4.0}
VectorOrPrefWritable
input: MR2的输出userVectors
map: 输出:(itemId,VectorOrPrefWritable<userId, pref>)
1.main
/**
 * Driver for MR(31): wraps each co-occurrence row from MR(2) into a
 * VectorOrPrefWritable so it can later be co-grouped with user preferences.
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: WiKiDriver31 <in><out>");
        System.exit(2);
    }
    Job job1 = new Job(conf1, "wiki job three1");
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setNumReduceTasks(1);
    job1.setJarByClass(WiKiDriver31.class);
    job1.setMapperClass(WikiMapper31.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    // an identity reducer is set only so the output goes through SequenceFileOutputFormat
    job1.setReducerClass(WiKiReducer31.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorOrPrefWritable.class);
    // this MR's input is MR(2)'s output
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[1]));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
}
2map
publicclass WikiMapper31 extends Mapper<IntWritable,VectorWritable,IntWritable,VectorOrPrefWritable>{
public void map(IntWritablekey,VectorWritable value,Context context) throws IOException, InterruptedException{
context.write(key, newVectorOrPrefWritable(value.get()));
// System.out.println("key"+key.toString()+",vlaue"+value.get());
}
}
3.reduce
/**
 * Identity reducer for MR(31): forwards every VectorOrPrefWritable
 * unchanged; it exists only so the job can write a SequenceFile.
 */
public class WiKiReducer31 extends Reducer<IntWritable, VectorOrPrefWritable, IntWritable, VectorOrPrefWritable> {
    public void reduce(IntWritable key, Iterable<VectorOrPrefWritable> values, Context context)
            throws IOException, InterruptedException {
        for (VectorOrPrefWritable wrapped : values) {
            context.write(key, wrapped);
        }
    }
}
四、第四个MR:
MR4的map不做任何事情;MR4的reduce输出就是把MR(31)和MR(32)的相同的itemID整合一下而已(注意此处的输入为两个路径):如下:
101 {107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0} [5 1 4 2 3] [4.0 5.0 5.0 2.0 2.5]
101共现矩阵
101{107:1.0,106:2.0,105:2.0,104:4.0,103:4.0,102:3.0,101:5.0}
101用户评分矩阵
[5 1 4 2 3] [4.0 5.0 5.0 2.0 2.5]
Item_id user_id:preference
101 2:2.0
101 5:4.0
101 4:5.0
101 3:2.0
101 1:5.0
1.main
/**
 * Driver for MR(4): reads both MR(31)'s and MR(32)'s output (note the two
 * input paths) and joins them per item into VectorAndPrefsWritable records.
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: WiKiDriver4 <in1><in2><out>");
        System.exit(2);
    }
    Job job1 = new Job(conf1, "wiki job four");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(WiKiDriver4.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper4.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    job1.setReducerClass(WiKiReducer4.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorAndPrefsWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    // two inputs: MR(31)'s output and MR(32)'s output
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[0]));
    SequenceFileInputFormat.addInputPath(job1, new Path(PATH + otherArgs[1]));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(PATH + otherArgs[2]));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
}
2.map
/**
 * Identity mapper for MR(4): passes each (itemID, VectorOrPrefWritable)
 * record straight through; the reducer does the co-grouping work.
 */
public class WikiMapper4 extends Mapper<IntWritable, VectorOrPrefWritable, IntWritable, VectorOrPrefWritable> {
    public void map(IntWritable itemID, VectorOrPrefWritable wrapped, Context context)
            throws IOException, InterruptedException {
        context.write(itemID, wrapped);
    }
}
3.reduce
public class WiKiReducer4 extendsReducer<IntWritable,VectorOrPrefWritable,IntWritable,VectorAndPrefsWritable>{
public void reduce(IntWritable key, Iterable<VectorOrPrefWritable>values,Context context) throws IOException, InterruptedException{
List<Long> userfs=new ArrayList<Long>(); //userID
List<Float> prefs=new ArrayList<Float>(); // pref
Vector v=null;
for(VectorOrPrefWritable value:values){
if(value.getVector()!=null){
v=value.getVector();
}else{
userfs.add(value.getUserID());
prefs.add(value.getValue());
}
}
context.write(key, new VectorAndPrefsWritable(v,userfs,prefs));
//System.out.println("key ,itemid:"+key.toString()+",information:"+v+","+userfs+","+prefs);
}
}
五、第五个MR:
map:针对MR4的输出的每一行中的每一个用户,用这个用户的评分值(value)去乘以项目之间的相似度向量,比如针对第一条记录中的用户3,则有 Vectorforuser3=[1.0 2.0 2.0 4.0 4.0 3.0 5.0]* 2.5 则map的输出为 key :3 value : Vectorforuser3;
map的输出应该如下所示:
alluserids:[5, 1, 4, 2, 3]
,userid:5,vector:{107:4.0,106:8.0,105:8.0,104:16.0,103:16.0,102:12.0,101:20.0}
,userid:1,vector:{107:5.0,106:10.0,105:10.0,104:20.0,103:20.0,102:15.0,101:25.0}
,userid:4,vector:{107:5.0,106:10.0,105:10.0,104:20.0,103:20.0,102:15.0,101:25.0}
,userid:2,vector:{107:2.0,106:4.0,105:4.0,104:8.0,103:8.0,102:6.0,101:10.0}
,userid:3,vector:{107:2.5,106:5.0,105:5.0,104:10.0,103:10.0,102:7.5,101:12.5}
Combine : 针对map的输出,把相同 key(即userID)的向量对应相加,得到的向量和即为该userID的对各个项目的评分;
combine的输出应该如下所示:
userid:1,vector:{107:5.0,106:18.0,105:15.5,104:33.5,103:39.0,102:31.5,101:44.0}
userid:2,vector:{107:4.0,106:20.5,105:15.5,104:36.0,103:41.5,102:32.5,101:45.5}
Reduce:针对combine的输出,把用户已经评价过分的项目筛选掉,然后按照评分值的大小有大到小排序输出,即为用户推荐项目;
最后的输出为:
1 [104:33.5,106:18.0,105:15.5,107:5.0]
2 [106:20.5,105:15.5,107:4.0]
3 [103:26.5,102:20.0,106:17.5]
4 [102:37.0,105:26.0,107:9.5]
5 [107:11.5]
1.main
/**
 * Driver for MR(5): turns the joined (co-occurrence, preference) records
 * into per-user recommendations; a combiner pre-sums the partial vectors.
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (remaining.length != 2) {
        System.err.println("Usage: WiKiDriver5 <in><out>");
        System.exit(2);
    }
    Job job = new Job(conf, "wiki job five");
    job.setJarByClass(WiKiDriver5.class);
    job.setNumReduceTasks(1);
    job.setMapperClass(WikiMapper5.class);
    job.setCombinerClass(WiKiCombiner5.class);
    job.setReducerClass(WiKiReducer5.class);
    job.setMapOutputKeyClass(VarLongWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(VarLongWritable.class);
    job.setOutputValueClass(RecommendedItemsWritable.class);
    // input is a SequenceFile; the final output stays plain text
    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(PATH + remaining[0]));
    FileOutputFormat.setOutputPath(job, new Path(PATH + remaining[1]));
    if (!job.waitForCompletion(true)) {
        System.exit(1); // abort on failure
    }
}
}
2.Map
publicclass WikiMapper5 extends Mapper<IntWritable,VectorAndPrefsWritable,VarLongWritable,VectorWritable>{
public void map(IntWritablekey,VectorAndPrefsWritable vectorAndPref,Context context) throws IOException,InterruptedException{
Vector coo=vectorAndPref.getVector();
List<Long>userIds=vectorAndPref.getUserIDs();
List<Float>prefValues=vectorAndPref.getValues();
//System.out.println("alluserids:"+userIds);
for(inti=0;i<userIds.size();i++){
long userID=userIds.get(i);
floatprefValue=prefValues.get(i);
Vectorpar=coo.times(prefValue);
context.write(newVarLongWritable(userID), new VectorWritable(par));
//System.out.println(",userid:"+userID+",vector:"+par); // ifthe user id = 3 is the same as my paper then is right
}
}
}
3.combine
publicclass WiKiCombiner5 extendsReducer<VarLongWritable,VectorWritable,VarLongWritable,VectorWritable>{
public void reduce(VarLongWritable key,Iterable<VectorWritable> values,Context context) throws IOException,InterruptedException{
Vector partial=null;
for(VectorWritable v:values){
partial=partial==null?v.get():partial.plus(v.get());
}
context.write(key, newVectorWritable(partial));
System.out.println("userid:"+key.toString()+",vecotr:"+partial);// here also should be the same as my paper'sresult
}
}
4.reduce
publicclass WiKiReducer5 extendsReducer<VarLongWritable,VectorWritable,VarLongWritable,RecommendedItemsWritable>{
private intrecommendationsPerUser=RECOMMENDATIONSPERUSER;
private String path=JOB1OUTPATH;
private staticFastMap<Integer,String> map=new FastMap<Integer,String>();
public void setup(Context context) throwsIOException{
Configuration conf=newConfiguration();
FileSystemfs=FileSystem.get(URI.create(path), conf);
Path tempPath=new Path(path);
SequenceFile.Reader reader=null;
try {
reader=new SequenceFile.Reader(fs,tempPath, conf);
Writablekey=(Writable)ReflectionUtils.newInstance(reader.getKeyClass(),conf);
Writable value = (Writable)ReflectionUtils.newInstance(reader.getValueClass(), conf);
// long position = reader.getPosition();
while (reader.next(key, value)){
map.put(Integer.parseInt(key.toString()), value.toString());
// System.out.println(key.toString()+","+value.toString());
// position = reader.getPosition(); //beginning of next record
}
} catch (Exception e) {
// TODO Auto-generated catchblock
e.printStackTrace();
}
}
public void reduce(VarLongWritable key,Iterable<VectorWritable> values,Context context) throws IOException,InterruptedException{
int userID=(int)key.get();
Vector rev=null;
for(VectorWritable vec:values){
rev=rev==null?vec.get():rev.plus(vec.get());
}
Queue<RecommendedItem>topItems=new PriorityQueue<RecommendedItem>(
recommendationsPerUser+1,
Collections.reverseOrder(ByValueRecommendedItemComparator.getInstance())
);
Iterator<Vector.Element>recommendationVectorIterator=
rev.iterateNonZero();
while(recommendationVectorIterator.hasNext()){
Vector.Elemente=recommendationVectorIterator.next();
int index=e.index();
System.out.println("Vecotr.element.indxe:"+index); // test here find the index is itemid or not ** test result : index isitem
if(!hasItem(userID,String.valueOf(index))){
float value=(float)e.get();
if(topItems.size()<recommendationsPerUser){
// here only set index
topItems.add(newGenericRecommendedItem(index,value));
}elseif(value>topItems.peek().getValue()){
topItems.add(newGenericRecommendedItem(index,value));
topItems.poll();
}
}
}
List<RecommendedItem>recom=newArrayList<RecommendedItem>(topItems.size());
recom.addAll(topItems);
Collections.sort(recom,ByValueRecommendedItemComparator.getInstance());
context.write(key, new RecommendedItemsWritable(recom));
}
public static boolean hasItem(int user,String item){ // to check whether the user has rate theitem
boolean flag=false;
String items=map.get(user);
if(items.contains(item)){
flag=true;
}
return flag;
}
}
最后一个reducer的编写基本思路:在Reducer的setup函数中读取SequenceFile的数据,这个数据是MR1的输出数据,用来排除用户已经评价过的项目。
在最后一个Reducer中我也用了一个FastMap,这个类是Mahout的,应该用他提供的一些类会运行的更加快吧。