Mahout 0.8 ships two RecommenderJob classes under the hadoop package: one in item and one in pseudo. The difference is that the former is a fully distributed implementation, while the latter is a pseudo-distributed implementation (and has already been deprecated).
The pseudo-distributed implementation is fairly simple:
public final class RecommenderJob extends AbstractJob {

  @Override
  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    addInputOption();
    addOutputOption();
    addOption("recommenderClassName", "r", "Name of recommender class to instantiate");
    addOption("numRecommendations", "n", "Number of recommendations per user", "10");
    addOption("usersFile", "u", "File of users to recommend for", null);

    Map<String,List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    Path inputFile = getInputPath();
    Path outputPath = getOutputPath();
    Path usersFile = hasOption("usersFile") ? new Path(getOption("usersFile")) : inputFile;

    String recommendClassName = getOption("recommenderClassName");
    int recommendationsPerUser = Integer.parseInt(getOption("numRecommendations"));

    Job job = prepareJob(usersFile,
                         outputPath,
                         TextInputFormat.class,
                         UserIDsMapper.class,
                         VarLongWritable.class,
                         NullWritable.class,
                         RecommenderReducer.class,
                         VarLongWritable.class,
                         RecommendedItemsWritable.class,
                         TextOutputFormat.class);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    Configuration jobConf = job.getConfiguration();
    jobConf.set(RecommenderReducer.RECOMMENDER_CLASS_NAME, recommendClassName);
    jobConf.setInt(RecommenderReducer.RECOMMENDATIONS_PER_USER, recommendationsPerUser);
    jobConf.set(RecommenderReducer.DATA_MODEL_FILE, inputFile.toString());

    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : -1;
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new RecommenderJob(), args);
  }
}
public final class RecommenderJob extends AbstractJob: the job extends Mahout's org.apache.mahout.common.AbstractJob, a helper class that simplifies job setup:

public abstract class AbstractJob extends Configured implements Tool

To add your own command-line options, register them directly in the run method (the addOption calls above). Calling prepareJob then collapses the job setup: instead of one line each for setting the Mapper class, the map output key class, and so on, a single call configures everything. To make a parameter available to the Mapper and Reducer, set it on the job's configuration, e.g. job.getConfiguration().set("sc", sc).
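On the reading side, the Mapper or Reducer pulls such a parameter back out of the configuration in its setup method. A minimal sketch, assuming a hypothetical MyReducer and the illustrative key "sc" from above (neither is part of Mahout):

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.math.VarLongWritable;

public class MyReducer extends Reducer<VarLongWritable,NullWritable,VarLongWritable,Text> {

  private String sc;

  @Override
  protected void setup(Context context) {
    // read back the value the driver stored via job.getConfiguration().set("sc", ...);
    // the key "sc" and the default value are placeholders for illustration
    this.sc = context.getConfiguration().get("sc", "defaultValue");
  }
}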
The parameter list of prepareJob() is: (input path, output path, input format class, Mapper class, Mapper output key class, Mapper output value class, Reducer class, Reducer output key class, Reducer output value class, output format class).
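The corresponding declaration in Mahout 0.8's AbstractJob looks roughly like this (signature abbreviated here for readability):

protected Job prepareJob(Path inputPath,
                         Path outputPath,
                         Class<? extends InputFormat> inputFormat,
                         Class<? extends Mapper> mapper,
                         Class<? extends Writable> mapperKey,
                         Class<? extends Writable> mapperValue,
                         Class<? extends Reducer> reducer,
                         Class<? extends Writable> reducerKey,
                         Class<? extends Writable> reducerValue,
                         Class<? extends OutputFormat> outputFormat) throws IOException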
AbstractJob implements org.apache.hadoop.util.Tool (via the declaration above), so the job is ultimately launched through ToolRunner. For the details of how ToolRunner works, see: http://hnote.org/big-data/hadoop/hadoop-tool-toolrunner
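In essence, ToolRunner.run pushes the arguments through GenericOptionsParser, which consumes the generic Hadoop flags (-D key=value, -conf, -fs, -jt, ...) into the Configuration, and then invokes the Tool with whatever arguments remain. A simplified sketch of the idea (not the verbatim Hadoop source):

public static int run(Configuration conf, Tool tool, String[] args) throws Exception {
  if (conf == null) {
    conf = new Configuration();
  }
  // strip the generic Hadoop options out of args and apply them to conf
  GenericOptionsParser parser = new GenericOptionsParser(conf, args);
  tool.setConf(conf);
  // hand the tool only its own remaining, tool-specific arguments
  return tool.run(parser.getRemainingArgs());
}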
At its core this is a single job whose map is UserIDsMapper and whose reduce is RecommenderReducer.
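RecommenderReducer is where the real work happens: in setup it loads a DataModel from the file named by DATA_MODEL_FILE and reflectively instantiates the configured non-distributed Recommender; in reduce it asks that recommender for each user's top-N items. A rough sketch of the reduce side, assuming recommender and recommendationsPerUser are fields initialized in setup (simplified, not the verbatim Mahout source):

@Override
protected void reduce(VarLongWritable userID, Iterable<NullWritable> values, Context context)
    throws IOException, InterruptedException {
  try {
    // delegate to the wrapped single-machine recommender for this user's top-N items
    List<RecommendedItem> items = recommender.recommend(userID.get(), recommendationsPerUser);
    context.write(userID, new RecommendedItemsWritable(items));
  } catch (TasteException te) {
    throw new IllegalStateException(te);
  }
}

In other words, "pseudo-distributed" just shards the user IDs across reducers and runs an ordinary in-memory recommender inside each one.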
The fully distributed implementation, by contrast, is much more involved.
public final class RecommenderJob extends AbstractJob {

  @Override
  public int run(String[] args) throws Exception {
    // (argument parsing and path setup are omitted from this excerpt)

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
        "--input", getInputPath().toString(),
        "--output", prepPath.toString(),
        "--maxPrefsPerUser", String.valueOf(maxPrefsPerUserInItemSimilarity),
        "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
        "--booleanData", String.valueOf(booleanData),
        "--tempDir", getTempPath().toString(),
      });
      numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      /* special behavior if phase 1 is skipped */
      if (numberOfUsers == -1) {
        numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
            PathType.LIST, null, getConf());
      }

      /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
       * new DistributedRowMatrix(...).rowSimilarity(...) */
      // calculate the co-occurrence matrix
      ToolRunner.run(getConf(), new RowSimilarityJob(), new String[]{
        "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
        "--output", similarityMatrixPath.toString(),
        "--numberOfColumns", String.valueOf(numberOfUsers),
        "--similarityClassname", similarityClassname,
        "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem),
        "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
        "--threshold", String.valueOf(threshold),
        "--tempDir", getTempPath().toString(),
      });

      // write out the similarity matrix if the user specified that behavior
      if (hasOption("outputPathForSimilarityMatrix")) {
        Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));

        Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix,
            SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
            EntityEntityWritable.class, DoubleWritable.class, ItemSimilarityJob.MostSimilarItemPairsReducer.class,
            EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);

        Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
        mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
            new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
        outputSimilarityMatrix.waitForCompletion(true);
      }
    }

    // start the multiplication of the co-occurrence matrix by the user vectors
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job partialMultiply = new Job(getConf(), "partialMultiply");
      Configuration partialMultiplyConf = partialMultiply.getConfiguration();

      MultipleInputs.addInputPath(partialMultiply, similarityMatrixPath, SequenceFileInputFormat.class,
          SimilarityMatrixRowWrapperMapper.class);
      MultipleInputs.addInputPath(partialMultiply, new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
          SequenceFileInputFormat.class, UserVectorSplitterMapper.class);
      partialMultiply.setJarByClass(ToVectorAndPrefReducer.class);
      partialMultiply.setMapOutputKeyClass(VarIntWritable.class);
      partialMultiply.setMapOutputValueClass(VectorOrPrefWritable.class);
      partialMultiply.setReducerClass(ToVectorAndPrefReducer.class);
      partialMultiply.setOutputFormatClass(SequenceFileOutputFormat.class);
      partialMultiply.setOutputKeyClass(VarIntWritable.class);
      partialMultiply.setOutputValueClass(VectorAndPrefsWritable.class);
      partialMultiplyConf.setBoolean("mapred.compress.map.output", true);
      partialMultiplyConf.set("mapred.output.dir", partialMultiplyPath.toString());

      if (usersFile != null) {
        partialMultiplyConf.set(UserVectorSplitterMapper.USERS_FILE, usersFile);
      }
      partialMultiplyConf.setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED, maxPrefsPerUser);

      boolean succeeded = partialMultiply.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      // filter out any users we don't care about
      /* convert the user/item pairs to filter if a filterfile has been specified */
      if (filterFile != null) {
        Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
            ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
            ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
            SequenceFileOutputFormat.class);
        boolean succeeded = itemFiltering.waitForCompletion(true);
        if (!succeeded) {
          return -1;
        }
      }

      String aggregateAndRecommendInput = partialMultiplyPath.toString();
      if (filterFile != null) {
        aggregateAndRecommendInput += "," + explicitFilterPath;
      }

      // extract out the recommendations
      Job aggregateAndRecommend = prepareJob(
          new Path(aggregateAndRecommendInput), outputPath, SequenceFileInputFormat.class,
          PartialMultiplyMapper.class, VarLongWritable.class, PrefAndSimilarityColumnWritable.class,
          AggregateAndRecommendReducer.class, VarLongWritable.class, RecommendedItemsWritable.class,
          TextOutputFormat.class);
      Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
      if (itemsFile != null) {
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
      }

      if (filterFile != null) {
        setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath, explicitFilterPath);
      }
      setIOSort(aggregateAndRecommend);
      aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
          new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
      aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
      aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);

      boolean succeeded = aggregateAndRecommend.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    return 0;
  }

  private static void setIOSort(JobContext job) {
    Configuration conf = job.getConfiguration();
    conf.setInt("io.sort.factor", 100);
    String javaOpts = conf.get("mapred.map.child.java.opts"); // new arg name
    if (javaOpts == null) {
      javaOpts = conf.get("mapred.child.java.opts"); // old arg name
    }
    int assumedHeapSize = 512;
    if (javaOpts != null) {
      Matcher m = Pattern.compile("-Xmx([0-9]+)([mMgG])").matcher(javaOpts);
      if (m.find()) {
        assumedHeapSize = Integer.parseInt(m.group(1));
        String megabyteOrGigabyte = m.group(2);
        if ("g".equalsIgnoreCase(megabyteOrGigabyte)) {
          assumedHeapSize *= 1024;
        }
      }
    }
    // Cap this at 1024MB now; see https://issues.apache.org/jira/browse/MAPREDUCE-2308
    conf.setInt("io.sort.mb", Math.min(assumedHeapSize / 2, 1024));
    // For some reason the Merger doesn't report status for a long time; increase
    // timeout when running these jobs
    conf.setInt("mapred.task.timeout", 60 * 60 * 1000);
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new RecommenderJob(), args);
  }
}
The first thing executed is ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{....}).
Here getConf() fetches the Hadoop configuration (a Configuration object).
Hadoop manages its configuration files neither with java.util.Properties nor with Apache Jakarta Commons Configuration; instead it uses its own configuration-management system and exposes its own API, namely org.apache.hadoop.conf.Configuration, to handle configuration information. Hadoop configuration files are XML; below is an example of a Hadoop configuration file.
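A minimal sketch in the standard format (the fs.default.name entry shown here is just one example setting):

<?xml version="1.0"?>
<configuration>
  <!-- each entry is a <property> with a name, a value, and optionally a description -->
  <property>
    <name>fs.default.name</name>
    <value>hdfs://localhost:9000</value>
    <description>The name of the default file system.</description>
  </property>
</configuration>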