Following the implementations of a few more experienced authors, I put together my own adapted version.
A sample of the raw data is shown below; the leading 1 or 0 is the label of that line, i.e. each line of data belongs to class 1 or class 0:
1:B,C,D,F,E,L
0:A,C,E,K
1:F,A,D,I
The program is written in four parts.
Part 1 counts the total number of words under each label. Sample output: 0 12 and 1 19, meaning label 0 has 12 words and label 1 has 19 words.
Part 2 counts how many distinct words the data contains. Sample output: sum 20, meaning there are 20 different words in total.
Part 3 computes the conditional probability of each word under each label. Sample output:
0,M 0.044444444444444446
0,P 0.044444444444444446
0,T 0.044444444444444446
1,A 0.08695652173913043
1,B 0.08695652173913043
Part 4 takes a word group that you can write yourself, for example A,D,V,C, and outputs a label for each word, e.g.:
0 A
1 D
1 V
1 C
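To make these four quantities concrete, here is a minimal sketch in plain Java (run outside Hadoop) that computes them for just the three sample lines above. Because it only uses those three lines, the numbers differ from the sample outputs, which come from a larger data set; the class and variable names are illustrative only.
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class NaiveBayesSketch {
    public static void main(String[] args) {
        String[] data = {"1:B,C,D,F,E,L", "0:A,C,E,K", "1:F,A,D,I"};
        Map<String, Integer> wordsPerLabel = new HashMap<>();      // Part 1: words under each label
        Set<String> vocabulary = new HashSet<>();                  // Part 2: distinct words
        Map<String, Integer> labelWordCount = new HashMap<>();     // "label:word" -> occurrences, used by Part 3
        for (String line : data) {
            String[] parts = line.split(":");                      // parts[0] = label, parts[1] = word list
            String[] words = parts[1].split(",");
            wordsPerLabel.merge(parts[0], words.length, Integer::sum);
            for (String w : words) {
                vocabulary.add(w);
                labelWordCount.merge(parts[0] + ":" + w, 1, Integer::sum);
            }
        }
        System.out.println(wordsPerLabel);                         // {0=4, 1=10} for these three lines
        System.out.println("sum " + vocabulary.size());            // sum 9
        // Part 3: Laplace-smoothed conditional probability, e.g. P(A | label 1)
        int countA1 = labelWordCount.getOrDefault("1:A", 0);       // A appears once under label 1
        double pA1 = (countA1 + 1) * 1.0 / (wordsPerLabel.get("1") + vocabulary.size());
        System.out.println("1,A " + pA1);                          // (1+1)/(10+9) ≈ 0.105
    }
}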
package naive;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class fenlei {
static class fenleiMapper extends Mapper<LongWritable, Text, Text, Text>{
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] line = value.toString().split(":");
// line[0] is the label, line[1] is the comma-separated word list
String[] line1 = line[1].split(",");
int a = line1.length;
// emit <label, number of words on this line>
context.write(new Text(line[0]), new Text(String.valueOf(a)));
}
}
static class fenleiReducer extends Reducer<Text, Text, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (Text val : values) {
// values holds the word count of each line under this label
sum += Integer.parseInt(val.toString());
}
// emit <label, total word count under this label>
context.write(key, new IntWritable(sum));
}
}
// driver
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf,"fenlei");
job.setJarByClass(fenlei.class);
job.setMapperClass(fenleiMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(fenleiReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
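A practical note on this job's output: the helper classes in the later parts (Utils and Utils1) read these files back with line.split(","), but TextOutputFormat separates key and value with a tab by default. The hard-coded C:/...txt paths in Parts 2 and 3 suggest the intermediate files were adjusted by hand; an alternative sketch, assuming a Hadoop 2.x setup, is to make the job emit comma-separated lines itself by setting the TextOutputFormat separator in the driver (only the changed lines of main() are shown):
Configuration conf = new Configuration();
// write "label,count" instead of the default tab-separated "label<TAB>count",
// so the later jobs can parse the file with split(",")
conf.set("mapreduce.output.textoutputformat.separator", ",");
Job job = Job.getInstance(conf, "fenlei");
// ...the rest of main() stays as above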
Part 2 — the prediction step (class jieguo). It loads the outputs of the other three jobs through the Configuration and classifies each word of the input word group; this is the step described as Part 4 at the top.
package naive;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.LineReader;
public class jieguo {
public static class jieguoMapper extends Mapper<LongWritable, Text, Text, Text>{
public Map<String,Integer> map = new HashMap();
@Override
protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
Configuration conf=context.getConfiguration();
String proPath=conf.get("proPath");
try {
map=Utils1.getMapFormHDFS(proPath);
// word counts under label 0 and label 1
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
protected void map(LongWritable ikey, Text ivalue, Context context)
throws IOException, InterruptedException {
for (Map.Entry<String, Integer> entry : map.entrySet()) {
// emit <label, input word group> once for each label (0 and 1)
context.write(new Text(entry.getKey()), ivalue);
}
}
}
public static class jieguoReducer extends Reducer<Text, Text, Text, Text>{
public Map<String,Double> mapDouble = new HashMap<>();   // conditional probability of each "label:word"
public Map<String,Integer> mapInteger = new HashMap<>(); // total word count under each label
public Map<String,Double> noFind = new HashMap<>();      // default probability for a word never seen under a label
public Map<String,Double> prePro = new HashMap<>();      // prior probability of each label
@Override
protected void setup(Context context) throws IOException, InterruptedException {
System.out.println("=======reduce从这里开始跑的======");
System.out.println("=======reduce从这里开始跑的======");
Configuration conf=context.getConfiguration();
String condiProPath=conf.get("condiProPath");
String proPath=conf.get("proPath");
String countPath=conf.get("countPath");
System.out.println(countPath);
mapDouble=Utils1.getMapFormHDFS(condiProPath,true);
//获取到已经计算过的条件概率
System.out.println("打印的一个地址==" + countPath);
System.out.println("打印的mapdouble的值===="+ mapDouble);
// System.out.println("打印的mapdouble的值===="+ mapDouble.get("0:A"));
// System.out.println("=======reducer======");
// System.out.println("=======reduce从这里结束======");
// System.out.println("=======reduce从这里结束======");
try {
mapInteger=Utils1.getMapFormHDFS(proPath);
//获取到0下面的单词数量和1下面的单词数量
} catch (Exception e) {
}
int count=Utils1.getCountFromHDFS(countPath);
//获取到总的单词数量
for(Map.Entry<String,Integer> entry:mapInteger.entrySet()){
double pro=0.0;
noFind.put(entry.getKey(),(1.0/(count+entry.getValue())));
}
int sum=0;
for(Map.Entry<String,Integer> entry:mapInteger.entrySet()){
sum+=entry.getValue();
}
for(Map.Entry<String,Integer> entry:mapInteger.entrySet()){
prePro.put(entry.getKey(),(entry.getValue()*1.0/sum));
}
}
@Override
protected void reduce(Text _key, Iterable<Text> values, Context context1) throws IOException, InterruptedException {
// _key is the label this reduce call received (0 or 1); the word group itself
// arrives in values, so the per-word decision below is the same for both keys
for (Text val : values) {
String[] words = val.toString().split(",");
for (int i = 0; i < words.length; i++) {
// conditional probability of this word under each label, falling back to the
// smoothed default when the word was never seen under that label in training
String key1 = "1" + ":" + words[i];
String key0 = "0" + ":" + words[i];
double pro1 = mapDouble.containsKey(key1) ? mapDouble.get(key1) : noFind.get("1");
double pro0 = mapDouble.containsKey(key0) ? mapDouble.get(key0) : noFind.get("0");
// emit the label with the larger conditional probability for this word
if (pro0 > pro1) {
context1.write(new Text("0"), new Text(words[i]));
} else {
context1.write(new Text("1"), new Text(words[i]));
}
}
}
}
}
public static void main(String[] args) throws Exception { // prediction driver
Configuration conf = new Configuration();
// String input="hdfs://10.107.8.110:9000/Bayes/Predict_input";
// String output="hdfs://10.107.8.110:9000/Bayes/Bayes_output/Predict";
// String condiProPath="hdfs://10.107.8.110:9000/Bayes/Bayes_output/Con";
// String proPath="hdfs://10.107.8.110:9000/Bayes/Bayes_output/Pro";
// String countPath="hdfs://10.107.8.110:9000/Bayes/Bayes_output/Count";
// String input="hdfs://10.107.8.110:9000/Bayes/Bayes_input";
// String output="hdfs://10.107.8.110:9000/Bayes/Bayes_output/Con";
// String proPath="hdfs://10.107.8.110:9000/Bayes/Bayes_output/Pro";//这是之前求各个类别下单词数目的输出
// String countPath="hdfs://10.107.8.110:9000/Bayes/Bayes_output/Count";
// String input="C:/fenlei.txt";
String input="C:/naive/shiyan/shiyan.txt";
String output="C:/naivedeooooo";
String condiProPath="C:/abc.txt";
String proPath="C:/naive/fenlei/a.txt";//这是之前求各个类别下单词数目的输出
String countPath="C:/naive/tiaojian/b.txt";//这是之前求的单词种类数 */
conf.set("condiProPath",condiProPath);
conf.set("proPath",proPath);
conf.set("countPath",countPath);
Job job = Job.getInstance(conf, "Predict");
job.setJarByClass(jieguo.class);
job.setMapperClass(jieguoMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(jieguoReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
job.waitForCompletion(true);
}
}
class Utils1{
public static Map<String,Integer> getMapFormHDFS(String input) throws Exception{
Configuration conf = new Configuration();
Path path = new Path(input);
FileSystem fs = path.getFileSystem(conf);
FileStatus[] status = fs.listStatus(path);
Map<String,Integer> map = new HashMap();
for(int i= 0;i < status.length;i++){
if(status[i].isFile()){
FSDataInputStream infs = fs.open(status[i].getPath());
LineReader reader = new LineReader(infs,conf);
Text line = new Text();
while (reader.readLine(line) > 0){
String[] temp = line.toString().split(",");
// System.out.println("bbbbbbbbbbbbbbbbbbbbbbbbb");
// System.out.println("取map里面的数据的数组");
// System.out.println(temp);
// System.out.println("bbbbbbbbbbbbbbbbbbbbbbbbb");
map.put(temp[0].toString(), Integer.parseInt(temp[1]));
// map.put(new String("1"), 10);
}
reader.close();
}
}
return map;
}
public static Map<String,Double> getMapFormHDFS(String input,boolean j) throws IOException{
Configuration conf=new Configuration();
Path path=new Path(input);
FileSystem fs=path.getFileSystem(conf);
FileStatus[] stats=fs.listStatus(path);
Map<String,Double> map=new HashMap();
for(int i=0;i<stats.length;i++){
if(stats[i].isFile()){
FSDataInputStream infs=fs.open(stats[i].getPath());
LineReader reader=new LineReader(infs,conf);
Text line=new Text();
while(reader.readLine(line)>0){
String[] temp1=line.toString().split(",");
String[] temp = temp1[1].toString().split(" ");
// each line is "label,word probability"; rebuild the key as "label:word"
String mapget = temp1[0] + ":" + temp[0];
map.put(mapget, Double.parseDouble(temp[1]));
}
reader.close();
}
}
return map;
}
public static int getCountFromHDFS(String input) throws IOException{
Configuration conf=new Configuration();
Path path=new Path(input);
FileSystem fs=path.getFileSystem(conf);
FileStatus[] stats=fs.listStatus(path);
int count=0;
for(int i=0;i<stats.length;i++){
if(stats[i].isFile()){
FSDataInputStream infs=fs.open(stats[i].getPath());
LineReader reader=new LineReader(infs,conf);
Text line=new Text();
while(reader.readLine(line)>0){
String[] temp=line.toString().split(",");
// the count file holds a "name,count" line; keep the last value read
count = Integer.parseInt(temp[1]);
}
reader.close();
}
}
return count;
}
}
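For reference, here is the line format that Utils1 assumes for each of the three helper files, shown as a small standalone example that parses one line of each format the same way Utils1 does. The sample contents are made up to match the output samples at the top of the post, and the class name is illustrative only:
public class Utils1FormatDemo {
    public static void main(String[] args) {
        // proPath and countPath lines look like "label,count" / "name,count"
        String proLine = "1,19";
        String[] pro = proLine.split(",");
        System.out.println(pro[0] + " -> " + Integer.parseInt(pro[1]));          // 1 -> 19

        // condiProPath lines look like "label,word probability": a comma after the label
        // and a space before the number, exactly what getMapFormHDFS(input, true) expects
        String condiLine = "1,A 0.08695652173913043";
        String[] temp1 = condiLine.split(",");
        String[] temp = temp1[1].split(" ");
        String key = temp1[0] + ":" + temp[0];                                   // "1:A", the key jieguoReducer looks up
        System.out.println(key + " -> " + Double.parseDouble(temp[1]));          // 1:A -> 0.08695652173913043
    }
}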
Part 3 — computing the conditional probability of each word under each label (class jisuan).
package naive;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class jisuan {
public static class jisuanMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
String[] line = value.toString().split(":");
// split into two parts: the label and the comma-separated word list
String[] line1 = line[1].split(",");
for (int i = 0; i < line1.length; i++) {
// emit <"label:word", 1> for every occurrence of the word under that label
String key1 = line[0] + ":" + line1[i];
context.write(new Text(key1), new IntWritable(1));
}
}
}
public static class jisuanReducer extends Reducer<Text, IntWritable, Text, DoubleWritable>{
public Map<String,Integer> map;   // total word count under each label
public int count;                 // total number of distinct words
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String proPath = conf.get("propath");
String countPath = conf.get("countPath");
try {
// word counts under each label, produced by the first job
map = Utils.getMapFormHDFS(proPath);
} catch (Exception e1) {
e1.printStackTrace();
}
// total number of distinct words, produced by the word-count job
count = Utils.getCountFromHDFS(countPath);
}
protected void reduce(Text _key, Iterable<IntWritable> values,Context context)
throws IOException, InterruptedException {
// input from the mapper: <"label:word", 1>
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
// sum is now the number of times this word appears under this label
}
// the label part of the key
int type = Integer.parseInt(_key.toString().split(":")[0]);
double probability = 0.0;
for (Map.Entry<String, Integer> entry : map.entrySet()) {
// map holds the total word count of each label
if (type == Integer.parseInt(entry.getKey())) {
// Laplace-smoothed conditional probability: (count(word,label)+1) / (words(label) + vocabulary size)
probability = (sum + 1) * 1.0 / (entry.getValue() + count);
}
}
context.write(_key, new DoubleWritable(probability));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String input="C:/fenlei.txt";
String output="C:/naiveaaaa";
String proPath="C:/naive/fenlei/a.txt";//这是之前求各个类别下单词数目的输出
String countPath="C:/naive/tiaojian/b.txt";//这是之前求的单词种类数
conf.set("propath",proPath);
conf.set("countPath",countPath);
Job job = Job.getInstance(conf, "ConditionPro");
job.setJarByClass(jisuan.class);
job.setMapperClass(jisuanMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(jisuanReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
job.waitForCompletion(true);
}
}
class Utils{
public static Map<String,Integer> getMapFormHDFS(String input) throws Exception{
Configuration conf = new Configuration();
Path path = new Path(input);
FileSystem fs = path.getFileSystem(conf);
FileStatus[] status = fs.listStatus(path);
Map<String,Integer> map = new HashMap();
for(int i= 0;i < status.length;i++){
if(status[i].isFile()){
FSDataInputStream infs = fs.open(status[i].getPath());
LineReader reader = new LineReader(infs,conf);
Text line = new Text();
while (reader.readLine(line) > 0){
String[] temp = line.toString().split(",");
System.out.println("bbbbbbbbbbbbbbbbbbbbbbbbb");
System.out.println("取map里面的数据的数组");
System.out.println(temp);
System.out.println("bbbbbbbbbbbbbbbbbbbbbbbbb");
map.put(temp[0].toString(), Integer.parseInt(temp[1]));
// map.put(new String("1"), 10);
}
reader.close();
}
}
return map;
}
public static Map<String,Double> getMapFormHDFS(String input,boolean j) throws IOException{
Configuration conf=new Configuration();
Path path=new Path(input);
FileSystem fs=path.getFileSystem(conf);
FileStatus[] stats=fs.listStatus(path);
Map<String,Double> map=new HashMap();
for(int i=0;i<stats.length;i++){
if(stats[i].isFile()){
FSDataInputStream infs=fs.open(stats[i].getPath());
LineReader reader=new LineReader(infs,conf);
Text line=new Text();
while(reader.readLine(line)>0){
String[] temp=line.toString().split(",");
//System.out.println(temp.length);
map.put(temp[0],Double.parseDouble(temp[1]));
}
reader.close();
}
}
return map;
}
public static int getCountFromHDFS(String input) throws IOException{
Configuration conf=new Configuration();
Path path=new Path(input);
FileSystem fs=path.getFileSystem(conf);
FileStatus[] stats=fs.listStatus(path);
int count=0;
for(int i=0;i<stats.length;i++){
if(stats[i].isFile()){
FSDataInputStream infs=fs.open(stats[i].getPath());
LineReader reader=new LineReader(infs,conf);
Text line=new Text();
while(reader.readLine(line)>0){
String[] temp=line.toString().split(",");
// the count file holds a "name,count" line; keep the last value read
count = Integer.parseInt(temp[1]);
}
reader.close();
}
}
return count;
}
}
Part 4 — counting the number of distinct words in the data (class tiaojian); this is the step described as Part 2 at the top.
package naive;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class tiaojian {
public static class tiaojianMapper extends Mapper<LongWritable, Text,Text, Text>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String[] line = value.toString().split(":");
String[] line1 = line[1].split(",");
String key1 = "1";
for(int i = 1; i < line1.length; i++){
context.write(new Text(key1), new Text(line1[i]));
}
}
}
public static class tiaojianCombine extends Reducer<Text, Text, Text, Text>{
@Override
protected void reduce(Text _key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
Set<String> set = new HashSet<>();
for (Text val : values) {
set.add(val.toString());
}
// locally de-duplicate so each map task emits each word at most once
for (Iterator<String> it = set.iterator(); it.hasNext();) {
context.write(new Text("1"), new Text(it.next()));
}
}
}
public static class tiaojianReducer extends Reducer<Text, Text, Text, Text>{
@Override
protected void reduce(Text _key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
Set<String> set = new HashSet<>();
for(Text val:values){
set.add(val.toString());
}
context.write(new Text("num is"), new Text(String.valueOf(set.size())));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Count");
// String input="hdfs://10.107.8.110:9000/Bayes/Bayes_input";
// String output="hdfs://10.107.8.110:9000/Bayes/Bayes_output/Count";
job.setJarByClass(tiaojian.class);
job.setMapperClass(tiaojianMapper.class);
job.setCombinerClass(tiaojianCombine.class);
job.setReducerClass(tiaojianReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
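A note on the design: every word is funneled to the single key "1", so the combiner (tiaojianCombine) de-duplicating words per map task is what keeps the shuffle small, and the single reducer only has to merge the already de-duplicated sets. For small inputs the result is easy to sanity-check in plain Java; on the three sample lines at the top the count should be 9 (the class name below is illustrative only):
import java.util.HashSet;
import java.util.Set;

public class DistinctWordCheck {
    public static void main(String[] args) {
        String[] data = {"1:B,C,D,F,E,L", "0:A,C,E,K", "1:F,A,D,I"};
        Set<String> words = new HashSet<>();
        for (String line : data) {
            // same parsing as tiaojianMapper: drop the label, split the word list
            for (String w : line.split(":")[1].split(",")) {
                words.add(w);
            }
        }
        System.out.println("num is " + words.size());   // num is 9 for the sample lines
    }
}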