之前写的代码都是在单机上跑的,发现现在很流行Hadoop,所以又试着用Hadoop MapReduce来处理下决策树的创建。
看了一些Mahout处理决策树和随机森林的过程,大体过程是Job只有一个Mapper处理,在map方法里面做数据的转换收集工作,然后在cleanup方法里面去做决策树的创建过程。然后将决策树序列化到HDFS上面,分类样本数据集的时候,再从HDFS上面取回决策树结构。大体来说,Mahout决策树的构建过程好像并没有结合分布式计算,因为我也并没有仔仔细细地去研读Mahout里面的源码,所以可能是我没发现。下面是我实现的一个简单的Hadoop版本决策树,用的是C4.5算法,通过MapReduce去计算增益率。最后生成的决策树并未保存在HDFS上面,后面有时间再考虑下吧。下面是具体代码实现:
public class DecisionTreeC45Job extends AbstractJob {
/**
 * Prepares the data set for the job: writes {@code trainData} to the path
 * returned by {@code FileUtils.obtainRandomTxtPath()} and then copies that
 * file to HDFS under {@code HDFSUtils.HDFS_TEMP_INPUT_URL}.
 *
 * <p>The original author's note says the data set passed in has already had
 * its default values filled in.
 *
 * @param trainData the data set to write out and upload
 * @return the HDFS destination path ({@code HDFS_TEMP_INPUT_URL} + file name)
 */
public String prepare(Data trainData) {
    final String localPath = FileUtils.obtainRandomTxtPath();
    DataHandler.writeData(localPath, trainData);
    System.out.println(localPath);

    // Keep only the file name, i.e. the text after the last path separator.
    final int nameStart = localPath.lastIndexOf(File.separator) + 1;
    final String hdfsPath = HDFSUtils.HDFS_TEMP_INPUT_URL + localPath.substring(nameStart);

    HDFSUtils.copyFromLocalFile(conf, localPath, hdfsPath);
    return hdfsPath;
}
/**
 * Chooses the best attribute by reading the files produced by the MapReduce
 * gain-ratio job and keeping the record with the largest gain ratio.
 *
 * <p>A record is selected when its gain ratio is {@code >=} the best seen so
 * far (starting from {@code 0.0}), so on ties the record read last wins.
 * Once every file has been read without error, the output path is deleted.
 *
 * <p>If an {@link IOException} occurs, its stack trace is printed, the output
 * path is not deleted, and whatever record had been selected up to that point
 * (possibly {@code null}) is returned.
 *
 * @param output path of the job output to scan
 * @return the record with the highest gain ratio, or {@code null} if no
 *         record was selected
 */
public AttributeGainWritable chooseBestAttribute(String output) {
    AttributeGainWritable maxAttribute = null;
    Path path = new Path(output);
    try {
        FileSystem fs = path.getFileSystem(conf);
        Path[] paths = HDFSUtils.getPathFiles(fs, path);
        ShowUtils.print(paths);
        double maxGainRatio = 0.0;
        for (Path p : paths) {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, conf);
            try {
                Text key = (Text) ReflectionUtils.newInstance(
                        reader.getKeyClass(), conf);
                AttributeGainWritable value = new AttributeGainWritable();
                while (reader.next(key, value)) {
                    double gainRatio = value.getGainRatio();
                    if (gainRatio >= maxGainRatio) {
                        maxGainRatio = gainRatio;
                        maxAttribute = value;
                    }
                    // Use a fresh instance for the next record so the one
                    // possibly retained in maxAttribute is not overwritten.
                    value = new AttributeGainWritable();
                }
            } finally {
                // Close in finally so the reader is released even when
                // reading a record throws.
                IOUtils.closeQuietly(reader);
            }
        }
        System.out.println("output: " + path.toString());
        HDFSUtils.delete(conf, path);
        System.out.println("hdfs delete file : " + path.toString());
    } catch (IOException e) {
        e.printStackTrace();
    }
    return maxAttribute;
}
/** 构造决策树 */
public Object build(String input, Data data) {
Object preHandleResult = preHandle(data);
if (null != preHandleResult) return preHandleResult;
String output = HDFSUtils.HDFS_TEMP_OUTPUT_URL;
HDFSUtils.delete(conf, new Path(output));
System.out.println("delete output path : " + output);
String[] paths = new String[]{input, output};
//通过MapReduce计算增益率
CalculateC45GainRatioMR.main(paths);
AttributeGainWritable bestAttr = choo