I need to learn the random forest algorithm for my thesis, and my advisor told me that Mahout already implements it,
so let's dig into Mahout's decision forest implementation!
First, set up and configure a Mahout environment. I won't go into detail on that here; there are plenty of guides online.
This walkthrough follows the steps in https://cwiki.apache.org/confluence/display/MAHOUT/Partial+Implementation
Download the two files mentioned in that document from http://nsl.cs.unb.ca/NSL-KDD/
KDDTrain+.ARFF and KDDTest+.ARFF
One is the training set and the other the test set; both are fairly small, under 20 MB each.
Once downloaded, upload them to HDFS.
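A minimal sketch of the upload using the Hadoop FileSystem API (the local filenames and HDFS paths are assumptions chosen to match the Describe arguments below; `hadoop fs -put` does the same thing from the shell):

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadToHdfs {
    public static void main(String[] args) throws Exception {
        // Assumed NameNode address, matching the paths used later in this post.
        FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:9000"), new Configuration());
        // copyFromLocalFile behaves like `hadoop fs -put`.
        fs.copyFromLocalFile(new Path("KDDTrain+.arff"),
                new Path("/user/ashqal/HadoopMaven/input/KDDTrain+.arff"));
        fs.copyFromLocalFile(new Path("KDDTest+.arff"),
                new Path("/user/ashqal/HadoopMaven/input/KDDTest+.arff"));
        fs.close();
    }
}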
Reading along with the code analysis at http://blog.csdn.net/fansy1990/article/category/1313502,
decision forest generation is a three-step process. Step 1: generate the Describe file!
Use org.apache.mahout.classifier.df.tools.Describe to generate it:
package df;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.classifier.df.data.DescriptorException;
import org.apache.mahout.classifier.df.tools.Describe;
import org.apache.mahout.common.HadoopUtil;

import java.io.IOException;
import java.util.Arrays;

/**
 * Created by ashqal on 14-3-7.
 */
public class Step1Describe01 {

    public Step1Describe01() throws IOException, DescriptorException {
        // Equivalent command line:
        //   -p testdata/KDDTrain+.arff -f testdata/KDDTrain+.info
        //   -d N 3 C 2 N C 4 N C 8 N 2 C 19 N L
        String[] arg = new String[]{
                // -p: path of the training data
                "-p", "hdfs://localhost:9000/user/ashqal/HadoopMaven/input/KDDTrain+.arff"
                // -f: where the generated dataset descriptor (.info) is written
                , "-f", "hdfs://localhost:9000/user/ashqal/HadoopMaven/output/KDDTrain+.info"
                // -d: attribute descriptor (N = numerical, C = categorical, L = label;
                // a number repeats the token that follows it, see below)
                , "-d", "N", "3", "C", "2", "N", "C", "4", "N", "C", "8", "N", "2", "C", "19", "N", "L"
        };
        // Delete any existing output file first so Describe can write a fresh one.
        HadoopUtil.delete(new Configuration(), new Path(arg[Arrays.asList(arg).indexOf("-f") + 1]));
        Describe.main(arg);
    }

    public static void main(String[] args) throws IOException, DescriptorException {
        new Step1Describe01();
    }
}
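The -d descriptor is worth unpacking: a number repeats the type token that follows it, so "N 3 C 2 N C 4 N C 8 N 2 C 19 N L" expands to 1 numerical, 3 categorical, 2 numerical, 1 categorical, 4 numerical, 1 categorical, 8 numerical, 2 categorical, and 19 numerical attributes, plus the label: 42 columns in total. Here is a small standalone sketch of that expansion rule (my own illustration, not Mahout code; the result can be checked against the generated .info shown below):

import java.util.ArrayList;
import java.util.List;

public class DescriptorExpand {
    // Expand a Describe-style descriptor: a number sets the repeat count
    // for the next type token (N = numerical, C = categorical, L = label).
    static List<String> expand(String... tokens) {
        List<String> out = new ArrayList<String>();
        int repeat = 1;
        for (String t : tokens) {
            if (t.matches("\\d+")) {
                repeat = Integer.parseInt(t);
            } else {
                for (int i = 0; i < repeat; i++) {
                    out.add(t);
                }
                repeat = 1;
            }
        }
        return out;
    }

    public static void main(String[] args) {
        List<String> attrs = expand("N", "3", "C", "2", "N", "C", "4", "N",
                "C", "8", "N", "2", "C", "19", "N", "L");
        // Prints "42 -> [N, C, C, C, N, N, C, N, N, N, N, C, ...]",
        // matching the 42 entries of the KDDTrain+.info shown below.
        System.out.println(attrs.size() + " -> " + attrs);
    }
}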
So this step really does one simple job:
it scans the training data, collects the values each attribute can take, and turns that into a JSON description.
For example, given records like these:
0,tcp,private,REJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,229,10,0.00,0.00,1.00,1.00,0.04,0.06,0.00,255,10,0.04,0.06,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
0,tcp,private,REJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,136,1,0.00,0.00,1.00,1.00,0.01,0.06,0.00,255,1,0.00,0.06,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
2,tcp,ftp_data,SF,12983,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,134,86,0.61,0.04,0.61,0.02,0.00,0.00,0.00,0.00,normal
0,icmp,eco_i,SF,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,65,0.00,0.00,0.00,0.00,1.00,0.00,1.00,3,57,1.00,0.00,1.00,0.28,0.00,0.00,0.00,0.00,anomaly
From these four records alone, the second attribute (the protocol column) would be described as {"values":["tcp","icmp"],"label":false,"type":"categorical"} (over the full training set, "udp" shows up as well).
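To make the "scan and collect" idea concrete, a toy sketch (again my own illustration, not the actual Describe internals) that derives that description for the protocol column of the four sample records:

import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Set;

public class DistinctValuesDemo {
    public static void main(String[] args) {
        // Column 2 (the protocol) taken from the four sample records above.
        String[] protocol = {"tcp", "tcp", "tcp", "icmp"};
        // Collect the distinct values in order of first appearance.
        Set<String> values = new LinkedHashSet<String>(Arrays.asList(protocol));
        // Mimic the JSON shape that Describe emits for a categorical attribute.
        System.out.println("{\"values\":" + values + ",\"label\":false,\"type\":\"categorical\"}");
        // -> {"values":[tcp, icmp],"label":false,"type":"categorical"}
    }
}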
After the job finishes, the generated KDDTrain+.info is:
[
{"values":null,"label":false,"type":"numerical"}
,{"values":["icmp","udp","tcp"],"label":false,"type":"categorical"}
,{"values":["vmnet","shell","smtp","ntp_u","kshell","aol","imap4","urh_i","netbios_ssn","tftp_u","uucp","mtp","nnsp","echo","tim_i","ssh","iso_tsap","time","netbios_ns","systat","login","hostnames","efs","supdup","http_8001","courier","ctf","finger","nntp","ftp_data","red_i","ldap","http","pm_dump","ftp","exec","klogin","auth","netbios_dgm","other","link","X11","discard","private","remote_job","IRC","pop_3","daytime","pop_2","gopher","sunrpc","rje","name","domain","uucp_path","http_2784","Z39_50","domain_u","csnet_ns","eco_i","whois","bgp","sql_net","printer","telnet","ecr_i","urp_i","netstat","http_443","harvest"],"label":false,"type":"categorical"}
,{"values":["S3","RSTR","SF","RSTO","SH","OTH","S2","RSTOS0","S1","REJ","S0"],"label":false,"type":"categorical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":["1","0"],"label":false,"type":"categorical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":["1","0"],"label":false,"type":"categorical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":["1","0"],"label":false,"type":"categorical"}
,{"values":["1","0"],"label":false,"type":"categorical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":null,"label":false,"type":"numerical"}
,{"values":["normal","anomaly"],"label":true,"type":"categorical"}
]
Step 1 done!
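The .info file produced here is exactly what the later build and test steps read back in. As a quick sanity check, you can load it the same way they do (a sketch against the Mahout 0.x df API as I understand it; verify the method names against your version):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.classifier.df.data.Dataset;

public class Step1Check {
    public static void main(String[] args) throws Exception {
        Path info = new Path("hdfs://localhost:9000/user/ashqal/HadoopMaven/output/KDDTrain+.info");
        // Dataset.load is what the build/test steps use to read the descriptor back.
        Dataset dataset = Dataset.load(new Configuration(), info);
        // Number of attributes in the descriptor.
        System.out.println("attributes: " + dataset.nbAttributes());
    }
}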
PS: some notes from the wiki page on how the tool works:
the tool searches for an existing infos file (which must be provided by the user) in the same directory as the dataset, with the same name and the ".infos" extension, containing the types of the attributes:
'N' numerical attribute
'C' categorical attribute
'L' label (this is also a categorical attribute)
'I' to ignore the attribute
each attribute is on a separate line
A Hadoop job is used to parse the dataset and collect the information, which means the dataset can be distributed over HDFS.
The results are written back to the same .info file, in the format needed by CDGA.