(二) 基于物品的CF
了解了 User CF,Mahout Item CF 的实现与 User CF 类似,是基于 ItemSimilarity,下面我们看实现的代码例子,它比 User CF 更简单,因为 Item CF 中并不需要引入邻居的概念:
// Build the data model from the file "preferences.dat".
File preferenceFile = new File("preferences.dat");
DataModel model = new FileDataModel(preferenceFile);
// The item similarity is computed over that same model.
ItemSimilarity similarity = new PearsonCorrelationSimilarity(model);
// The item-based recommender takes only the model and the similarity; no neighborhood is passed.
Recommender recommender = new GenericItemBasedRecommender(model, similarity);
首先来分析一下GenericItemBasedRecommender这个类,它的核心推荐方法recommend如下:
/**
 * Recommends items for the given user.
 *
 * <p>Returns an empty list right away when the user has no preferences. Otherwise the candidate
 * items come from {@code getAllOtherItems}, each candidate is scored through an
 * {@code Estimator} bound to this user, and {@code TopItems.getTopItems} selects the result.
 *
 * @param userID ID of the user to recommend for
 * @param howMany number of recommendations requested; must be at least 1
 * @param rescorer handed unchanged to {@code TopItems.getTopItems}
 * @return the list built by {@code TopItems.getTopItems}, or an empty list when the user has
 *     no preferences
 * @throws TasteException propagated from the data model, the candidate lookup or the scoring
 */
@Override
public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
  Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
  log.debug("Recommending items for user ID '{}'", userID);

  PreferenceArray userPrefs = getDataModel().getPreferencesFromUser(userID);
  if (userPrefs.length() == 0) {
    return Collections.emptyList();
  }

  // Collect every other item that could possibly be recommended.
  FastIDSet candidateIDs = getAllOtherItems(userID, userPrefs);

  // Create the estimator that scores one candidate for this user.
  TopItems.Estimator<Long> scorer = new Estimator(userID, userPrefs);

  // Keep the howMany candidates with the highest estimated score.
  List<RecommendedItem> recommendations =
      TopItems.getTopItems(howMany, candidateIDs.iterator(), rescorer, scorer);

  log.debug("Recommendations are: {}", recommendations);
  return recommendations;
}
1、获取其他可能的items
首先取出该userid用户所评价过的所有item,然后得到评价过其中每个item的user列表,再将这些user所评价过的item都添加到一个集合之中,最后从该集合中去除该userid用户已经评价过的item,得到我们想要的候选集合
/**
 * Returns the candidate items for a user by delegating to the configured
 * {@code candidateItemsStrategy}.
 *
 * @param userID ID of the user being recommended for
 * @param preferencesFromUser that user's existing preferences
 * @return whatever the strategy's {@code getCandidateItems} returns for this user and data model
 * @throws TasteException propagated from the strategy
 */
protected FastIDSet getAllOtherItems(long userID, PreferenceArray preferencesFromUser) throws TasteException {
  FastIDSet candidates = candidateItemsStrategy.getCandidateItems(userID, preferencesFromUser, dataModel);
  return candidates;
}
// Where candidateItemsStrategy comes from: this constructor passes in the default strategies.
/**
 * Creates a recommender from a data model and an item similarity.
 *
 * <p>Delegates to the four-argument constructor, supplying the results of
 * {@code AbstractRecommender.getDefaultCandidateItemsStrategy()} and
 * {@code getDefaultMostSimilarItemsCandidateItemsStrategy()} as the third and fourth arguments.
 *
 * @param dataModel the data model handed on to the four-argument constructor
 * @param similarity the item similarity handed on to the four-argument constructor
 */
public GenericItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity) {
this(dataModel,
similarity,
AbstractRecommender.getDefaultCandidateItemsStrategy(),
getDefaultMostSimilarItemsCandidateItemsStrategy());
}
/**
 * Supplies the default candidate-items strategy.
 *
 * @return a new {@code PreferredItemsNeighborhoodCandidateItemsStrategy}
 */
protected static CandidateItemsStrategy getDefaultCandidateItemsStrategy() {
  CandidateItemsStrategy defaultStrategy = new PreferredItemsNeighborhoodCandidateItemsStrategy();
  return defaultStrategy;
}
/**
 * Candidate-items strategy that gathers the items of every user who shares at least one
 * preferred item with the target user.
 */
public final class PreferredItemsNeighborhoodCandidateItemsStrategy extends AbstractCandidateItemsStrategy {

  /**
   * Builds the candidate set in three steps: for each preferred item, look up the preferences
   * recorded for that item; for each user appearing in them, add all of that user's item IDs to
   * the result; finally remove the preferred items themselves.
   *
   * @param preferredItemIDs IDs of the items the target user already has preferences for
   * @param dataModel source of the per-item preferences and per-user item IDs
   * @return the collected item IDs minus {@code preferredItemIDs}
   * @throws TasteException propagated from the data model
   */
  @Override
  protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException {
    FastIDSet candidates = new FastIDSet();
    for (long preferredItemID : preferredItemIDs) {
      PreferenceArray prefsForItem = dataModel.getPreferencesForItem(preferredItemID);
      // The length is read once, before iterating over the users who have this item.
      for (int i = 0, userCount = prefsForItem.length(); i < userCount; i++) {
        long otherUserID = prefsForItem.getUserID(i);
        candidates.addAll(dataModel.getItemIDsFromUser(otherUserID));
      }
    }
    // Items the target user already has a preference for are not candidates.
    candidates.removeAll(preferredItemIDs);
    return candidates;
  }
}
2、创建评估器
对上边得到的候选item列表中的每一个item进行评估:先计算该候选item与该userid用户所评价过的每个item之间的相似度,再以相似度为权重,对用户给这些item的评分做加权平均(即 Σ(相似度×评分) / Σ相似度),得到的这个值就是对这个item的评估值。相似度为NaN的item会被跳过;如果有效的相似度不超过一个,则返回NaN,表示无法给出评估
/**
 * Scores one candidate item for a fixed user by delegating to the enclosing recommender's
 * {@code doEstimatePreference}.
 *
 * <p>The user ID and that user's preferences are captured at construction time, so
 * {@code estimate} only needs the candidate item ID.
 */
private final class Estimator implements TopItems.Estimator<Long> {

  /** ID of the user every estimate is computed for. */
  private final long userID;

  /** Preferences of that user, passed to {@code doEstimatePreference} on every call. */
  private final PreferenceArray preferencesFromUser;

  /**
   * @param userID ID of the user to estimate for
   * @param preferencesFromUser that user's existing preferences
   */
  private Estimator(long userID, PreferenceArray preferencesFromUser) {
    this.userID = userID;
    this.preferencesFromUser = preferencesFromUser;
  }

  /**
   * @param itemID ID of the candidate item to score
   * @return the value of {@code doEstimatePreference} for the captured user and this item
   * @throws TasteException propagated from {@code doEstimatePreference}
   */
  @Override
  public double estimate(Long itemID) throws TasteException {
    return doEstimatePreference(userID, preferencesFromUser, itemID);
  }
}
/**
 * Estimates a user's preference for an item as the similarity-weighted average of the user's
 * existing preference values.
 *
 * <p>The similarity between {@code itemID} and each item the user has a preference for acts as
 * the weight; similarities that are NaN are left out. When {@code capper} is set, the estimate
 * is passed through {@code capper.capEstimate}.
 *
 * @param userID ID of the user (not read by this implementation)
 * @param preferencesFromUser the user's existing preferences
 * @param itemID ID of the item to estimate a preference for
 * @return the estimate, or {@code Float.NaN} when at most one similarity was usable
 * @throws TasteException propagated from the similarity computation
 */
protected float doEstimatePreference(long userID, PreferenceArray preferencesFromUser, long itemID)
    throws TasteException {
  // Similarity of the target item to every item this user has a preference for.
  double[] weights = similarity.itemSimilarities(itemID, preferencesFromUser.getIDs());

  double weightedSum = 0.0;
  double weightTotal = 0.0;
  int usableWeights = 0;
  for (int idx = 0; idx < weights.length; idx++) {
    double weight = weights[idx];
    if (Double.isNaN(weight)) {
      continue;
    }
    // Weights can be negative!
    weightedSum += weight * preferencesFromUser.getValue(idx);
    weightTotal += weight;
    usableWeights++;
  }

  // Discard the estimate when it rests on no data points, and also when it rests on just one.
  // This is a band-aid on the 'stock' item-based algorithm: with a single data point the
  // estimate is simply the user's rating for the one item that happened to have a defined
  // similarity, so the similarity score itself would not matter.
  if (usableWeights <= 1) {
    return Float.NaN;
  }

  float estimate = (float) (weightedSum / weightTotal);
  return capper == null ? estimate : capper.capEstimate(estimate);
}
3、获取评测分数最高的howMany个item返回
将上边可能item列表中的item的得分,都插入到一个优先队列中,保留评估值最高的howMany个item,作为最后的推荐结果返回
/**
 * Scores the candidate items and returns the retained ones as a sorted list.
 *
 * <p>Each candidate that the rescorer does not filter is scored by {@code estimator} and, when a
 * rescorer is given, rescored. Candidates whose final score is NaN, or whose estimate throws
 * {@code NoSuchItemException}, are skipped. The rest go into a priority queue that is trimmed
 * with {@code poll()} whenever it grows past {@code howMany} entries.
 *
 * @param howMany number of items to retain
 * @param possibleItemIDs iterator over the candidate item IDs; must not be null
 * @param rescorer optional filter and rescorer; {@code null} means scores are used as estimated
 * @param estimator produces the score for a candidate item ID; must not be null
 * @return the retained items sorted with {@code ByValueRecommendedItemComparator}, or an empty
 *     list when nothing was retained
 * @throws TasteException propagated from the estimator
 */
public static List<RecommendedItem> getTopItems(int howMany,
                                                LongPrimitiveIterator possibleItemIDs,
                                                IDRescorer rescorer,
                                                Estimator<Long> estimator) throws TasteException {
  Preconditions.checkArgument(possibleItemIDs != null, "argument is null");
  Preconditions.checkArgument(estimator != null, "argument is null");

  // Queue ordered by the reverse of the by-value comparator, with one extra slot of capacity.
  Queue<RecommendedItem> best = new PriorityQueue<RecommendedItem>(howMany + 1,
      Collections.reverseOrder(ByValueRecommendedItemComparator.getInstance()));
  boolean atCapacity = false;
  double threshold = Double.NEGATIVE_INFINITY;

  while (possibleItemIDs.hasNext()) {
    long candidateID = possibleItemIDs.next();
    if (rescorer != null && rescorer.isFiltered(candidateID)) {
      continue;
    }

    double estimated;
    try {
      // The estimator's score for this item is the predicted rating for the user.
      estimated = estimator.estimate(candidateID);
    } catch (NoSuchItemException nsie) {
      // A candidate that raises NoSuchItemException is skipped.
      continue;
    }

    double score;
    if (rescorer == null) {
      score = estimated;
    } else {
      score = rescorer.rescore(candidateID, estimated);
    }

    // Once the queue is at capacity, only scores above the current threshold are admitted.
    if (Double.isNaN(score) || (atCapacity && !(score > threshold))) {
      continue;
    }

    best.add(new GenericRecommendedItem(candidateID, (float) score));
    if (atCapacity || best.size() > howMany) {
      atCapacity = true;
      best.poll();
    }
    // The head of the queue supplies the threshold for the next candidate.
    threshold = best.peek().getValue();
  }

  if (best.isEmpty()) {
    return Collections.emptyList();
  }

  List<RecommendedItem> result = Lists.newArrayListWithCapacity(best.size());
  result.addAll(best);
  Collections.sort(result, ByValueRecommendedItemComparator.getInstance());
  return result;
}