A Java MapReduce implementation of Naive Bayes

Adapted from versions written by a few experts.

Sample input data. The leading 1 or 0 on each line is the label, i.e. the records fall into two classes:

1:B,C,D,F,E,L
0:A,C,E,K
1:F,A,D,I

The implementation has four parts. Part 1 counts the total number of words under each label. Example output: "0 12" and "1 19", meaning label 0 has 12 words and label 1 has 19 words.
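For the three sample lines shown above, this job would emit "0 4" and "1 10": label 0 has the 4 words A,C,E,K, and label 1 has 6 + 4 = 10 words across its two lines.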

Part 2 counts how many distinct words appear in the data. Example output: "sum 20", meaning the data contains 20 distinct words.
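For the three sample lines above, the distinct words are A,B,C,D,E,F,I,K,L, so the count would be 9.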

Part 3 computes the conditional probability of each word under each label. Example output:

0,M 0.044444444444444446
0,P 0.044444444444444446
0,T 0.044444444444444446
1,A 0.08695652173913043
1,B 0.08695652173913043
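These values are Laplace-smoothed conditional probabilities, which is exactly what Part 3's reducer computes:

P(word | label) = (occurrences of the word under the label + 1) / (total words under the label + number of distinct words)

As a check against the three sample lines at the top: B occurs once under label 1, label 1 has 10 words in total and there are 9 distinct words, so P(B | 1) = (1 + 1) / (10 + 9) ≈ 0.105.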

Part 4 takes a word list that you can write yourself, for example A,D,V,C, and predicts a label for each word; a minimal standalone sketch of this step follows the example output below. The output looks like:

0 A  1 D  1 V  1 C
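Before the MapReduce code, here is a minimal standalone sketch (no Hadoop) of the per-word decision used in the prediction step: for every query word, look up its conditional probability under label 1 and label 0, fall back to a small default for unseen words, and emit whichever label scores higher. The class name PredictSketch, the probability values and the fallback constant are made up for illustration only; they are not produced by the jobs in this post.

import java.util.HashMap;
import java.util.Map;

public class PredictSketch {
	public static void main(String[] args) {
		// Conditional probabilities keyed as "label:word" -- illustrative numbers only.
		Map<String, Double> condProb = new HashMap<String, Double>();
		condProb.put("0:A", 0.090);
		condProb.put("1:A", 0.044);
		condProb.put("1:D", 0.087);
		condProb.put("1:V", 0.050);
		condProb.put("1:C", 0.087);
		condProb.put("0:C", 0.044);
		// Fallback used when a word was never seen under a label (illustrative).
		double unseen = 0.02;

		for (String word : new String[]{"A", "D", "V", "C"}) {
			double p1 = condProb.containsKey("1:" + word) ? condProb.get("1:" + word) : unseen;
			double p0 = condProb.containsKey("0:" + word) ? condProb.get("0:" + word) : unseen;
			System.out.println((p1 > p0 ? "1" : "0") + " " + word);
		}
	}
}

Running it prints exactly the example output above.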

package naive;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class fenlei {
	
	  static class fenleiMapper extends Mapper<LongWritable, Text, Text, Text>{
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Each input line looks like "label:word,word,...".
			String[] line = value.toString().split(":");
			String[] words = line[1].split(",");
			// Emit <label, number of words on this line>.
			context.write(new Text(line[0]), new Text(String.valueOf(words.length)));
		}
	}
	
	 static class fenleiReducer extends Reducer<Text, Text, Text, IntWritable>{
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (Text val : values){
				// Each value is the word count of one input line.
				sum += Integer.parseInt(val.toString());
			}
			// Emit <label, total number of words under this label>.
			context.write(key, new IntWritable(sum));
		}
	}
	
	// Driver
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
//		String input = "C:\fenlei.txt";
//		String output = "C:\fenlei";
		Job job = Job.getInstance(conf,"fenlei");
		job.setJarByClass(fenlei.class);
		job.setMapperClass(fenleiMapper.class);
    
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setReducerClass(fenleiReducer.class);
		
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		job.waitForCompletion(true);
		
	}
	

}
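Assuming the class is packaged into a jar (the jar name naive.jar and the paths here are placeholders, not from the post), the job takes its input and output directories from the command line:

hadoop jar naive.jar naive.fenlei /bayes/input /bayes/output/pro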


Part 4: the prediction job (class jieguo). It is listed here ahead of Parts 2 and 3 even though it consumes their outputs.

package naive;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.LineReader;

public class jieguo {
	
	public static class jieguoMapper extends Mapper<LongWritable, Text, Text, Text>{
		// Word counts per label (keys "0" and "1"), i.e. the output of Part 1.
		public Map<String,Integer> map = new HashMap<String,Integer>();

		@Override
		protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			Configuration conf = context.getConfiguration();
			String proPath = conf.get("proPath");
			try {
				map = Utils1.getMapFormHDFS(proPath);
			} catch (Exception e) {
				e.printStackTrace();
			}
		}

		@Override
		protected void map(LongWritable ikey, Text ivalue, Context context)
				throws IOException, InterruptedException {
			// Forward every query line (a comma-separated word list) to each label,
			// so each label's reducer sees all of the lines.
			for (Map.Entry<String,Integer> entry : map.entrySet()) {
				context.write(new Text(entry.getKey()), ivalue);
			}
		}

	}
		
	public static class jieguoReducer extends Reducer<Text, Text, Text, Text>{
			public Map<String,Double> mapDouble = new HashMap<String,Double>();    // conditional probabilities from Part 3
			public Map<String,Integer> mapInteger = new HashMap<String,Integer>(); // word count per label from Part 1
			public Map<String,Double> noFind = new HashMap<String,Double>();       // fallback for words unseen under a label
			public Map<String,Double> prePro = new HashMap<String,Double>();       // prior probability of each label

		    @Override
		    protected void setup(Context context) throws IOException, InterruptedException {
		    	Configuration conf = context.getConfiguration();
		        String condiProPath = conf.get("condiProPath");
		        String proPath = conf.get("proPath");
		        String countPath = conf.get("countPath");
		        // Conditional probabilities computed by Part 3.
		        mapDouble = Utils1.getMapFormHDFS(condiProPath, true);
		        try {
		        	// Word counts per label computed by Part 1.
					mapInteger = Utils1.getMapFormHDFS(proPath);
				} catch (Exception e) {
					e.printStackTrace();
				}
		        // Number of distinct words computed by Part 2.
		        int count = Utils1.getCountFromHDFS(countPath);
		        // Laplace-style fallback probability for a word never seen under a label.
		        for (Map.Entry<String,Integer> entry : mapInteger.entrySet()) {
		            noFind.put(entry.getKey(), 1.0 / (count + entry.getValue()));
		        }
		        int sum = 0;
		        for (Map.Entry<String,Integer> entry : mapInteger.entrySet()) {
		            sum += entry.getValue();
		        }
		        // Prior probability of each label (computed here but not used in reduce below).
		        for (Map.Entry<String,Integer> entry : mapInteger.entrySet()) {
		            prePro.put(entry.getKey(), entry.getValue() * 1.0 / sum);
		        }
		    }
		    
		    @Override
		    protected void reduce(Text _key, Iterable<Text> values, Context context1) throws IOException, InterruptedException {
		        for (Text val : values) {
		            String[] words = val.toString().split(",");
		            for (int i = 0; i < words.length; i++) {
		                // Conditional probability of this word under each label,
		                // falling back to the unseen-word probability from setup().
		                Double p1 = mapDouble.get("1" + ":" + words[i]);
		                Double p0 = mapDouble.get("0" + ":" + words[i]);
		                double pro1 = (p1 != null) ? p1 : noFind.get("1");
		                double pro0 = (p0 != null) ? p0 : noFind.get("0");
		                // Emit the more probable label for this word.
		                if (pro1 > pro0) {
		                	context1.write(new Text("1"), new Text(words[i]));
		                } else {
		                	context1.write(new Text("0"), new Text(words[i]));
		                }
		            }
		        }
		    }
	
		    	
		    
	}
	public static void main(String[] args) throws Exception {// prediction driver
        Configuration conf = new Configuration();
        // Local test paths; on a cluster these would be HDFS URIs,
        // e.g. hdfs://10.107.8.110:9000/Bayes/...
        String input="C:/naive/shiyan/shiyan.txt";          // query word lists to classify
        String output="C:/naivedeooooo";
        String condiProPath="C:/abc.txt";                   // conditional probabilities from Part 3
        String proPath="C:/naive/fenlei/a.txt";             // word count per label from Part 1
        String countPath="C:/naive/tiaojian/b.txt";         // distinct-word count from Part 2
        conf.set("condiProPath",condiProPath);
        conf.set("proPath",proPath);
        conf.set("countPath",countPath);
        Job job = Job.getInstance(conf, "Predict");
        job.setJarByClass(jieguo.class);
        job.setMapperClass(jieguoMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(jieguoReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input and output are directories/paths, not individual records.
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.waitForCompletion(true);
    }

		
		
		

}

class Utils1{
	
	public static Map<String,Integer> getMapFormHDFS(String input) throws Exception{
		// Reads the output of Part 1; expects lines in the form "label,count".
		Configuration conf = new Configuration();
		Path path = new Path(input);
		FileSystem fs = path.getFileSystem(conf);
		FileStatus[] status = fs.listStatus(path);
		Map<String,Integer> map = new HashMap<String,Integer>();
		for(int i = 0;i < status.length;i++){
			if(status[i].isFile()){
				FSDataInputStream infs = fs.open(status[i].getPath());
				LineReader reader = new LineReader(infs,conf);
				Text line = new Text();
				while (reader.readLine(line) > 0){
					String[] temp = line.toString().split(",");
					map.put(temp[0], Integer.parseInt(temp[1]));
				}
				reader.close();
			}
		}
		return map;
	}
	
	public static Map<String,Double> getMapFormHDFS(String input,boolean j) throws IOException{
        // Reads the conditional probabilities of Part 3; expects lines in the
        // form "label,word probability", stored under the key "label:word".
        Configuration conf=new Configuration();
        Path path=new Path(input);
        FileSystem fs=path.getFileSystem(conf);

        FileStatus[] stats=fs.listStatus(path);
        Map<String,Double> map=new HashMap<String,Double>();
        for(int i=0;i<stats.length;i++){
            if(stats[i].isFile()){
                FSDataInputStream infs=fs.open(stats[i].getPath());
                LineReader reader=new LineReader(infs,conf);
                Text line=new Text();
                while(reader.readLine(line)>0){
                    String[] temp1=line.toString().split(",");
                    String[] temp = temp1[1].split(" ");
                    String mapget = temp1[0]+":"+temp[0];
                    map.put(mapget,Double.parseDouble(temp[1]));
                }
                reader.close();
            }
        }

        return map;
    }
	
	public static int getCountFromHDFS(String input) throws IOException{
        // Reads the distinct-word count of Part 2; expects lines whose second
        // comma-separated field is the count.
        Configuration conf=new Configuration();
        Path path=new Path(input);
        FileSystem fs=path.getFileSystem(conf);

        FileStatus[] stats=fs.listStatus(path);

        int count=0;
        for(int i=0;i<stats.length;i++){
            if(stats[i].isFile()){
                FSDataInputStream infs=fs.open(stats[i].getPath());
                LineReader reader=new LineReader(infs,conf);
                Text line=new Text();
                while(reader.readLine(line)>0){
                    String[] temp=line.toString().split(",");
                    count=Integer.parseInt(temp[1]);
                }
                reader.close();
            }
        }
        return count;
    }
	
	
}

Part 3: per-word conditional probabilities (class jisuan)

package naive;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class jisuan {
	
	public static class jisuanMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			// Split each line into its label and its word list.
			String[] line = value.toString().split(":");
			String[] line1 = line[1].split(",");
			for(int i = 0; i < line1.length; i++){
				// Emit <"label:word", 1> for every word occurrence.
				String key1 = line[0] + ":" + line1[i];
				context.write(new Text(key1), new IntWritable(1));
			}
		}
	}
	
	public static class jisuanReducer extends Reducer<Text, IntWritable, Text, DoubleWritable>{
		public Map<String,Integer> map;  // word count per label, from Part 1
		public int count;                // number of distinct words, from Part 2

		@Override
		protected void setup(Context context) throws IOException, InterruptedException {
			Configuration conf = context.getConfiguration();
			String proPath = conf.get("propath");
			String countPath = conf.get("countPath");
			try {
				map = Utils.getMapFormHDFS(proPath);
			} catch (Exception e1) {
				e1.printStackTrace();
			}
			count = Utils.getCountFromHDFS(countPath);
		}
		
		@Override
		protected void reduce(Text _key, Iterable<IntWritable> values, Context context)
						throws IOException, InterruptedException {
			// The mapper emits <"label:word", 1>; sum the ones to get the word's
			// occurrence count under that label.
			int sum = 0;
			for(IntWritable val : values){
				sum += val.get();
			}
			int type = Integer.parseInt(_key.toString().split(":")[0]);
			double probability = 0.0;
			for(Map.Entry<String, Integer> entry : map.entrySet()){
				if(type == Integer.parseInt(entry.getKey())){
					// Laplace-smoothed conditional probability:
					// (occurrences + 1) / (words under this label + distinct words).
					probability = (sum + 1) * 1.0 / (entry.getValue() + count);
				}
			}
			context.write(_key, new DoubleWritable(probability));
		}
		
	}
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
        String input="C:/fenlei.txt";
        String output="C:/naiveaaaa";
        String proPath="C:/naive/fenlei/a.txt";     // word count per label, output of Part 1
        String countPath="C:/naive/tiaojian/b.txt"; // distinct-word count, output of Part 2
        conf.set("propath",proPath);
        conf.set("countPath",countPath);
        Job job = Job.getInstance(conf, "ConditionPro");

        job.setJarByClass(jisuan.class);

        job.setMapperClass(jisuanMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(jisuanReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);

        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.waitForCompletion(true);
	}
	

}



class Utils{
	
	public static Map<String,Integer> getMapFormHDFS(String input) throws Exception{
		// Reads the output of Part 1; expects lines in the form "label,count".
		Configuration conf = new Configuration();
		Path path = new Path(input);
		FileSystem fs = path.getFileSystem(conf);
		FileStatus[] status = fs.listStatus(path);
		Map<String,Integer> map = new HashMap<String,Integer>();
		for(int i = 0;i < status.length;i++){
			if(status[i].isFile()){
				FSDataInputStream infs = fs.open(status[i].getPath());
				LineReader reader = new LineReader(infs,conf);
				Text line = new Text();
				while (reader.readLine(line) > 0){
					String[] temp = line.toString().split(",");
					map.put(temp[0], Integer.parseInt(temp[1]));
				}
				reader.close();
			}
		}
		return map;
	}
	
	public static Map<String,Double> getMapFormHDFS(String input,boolean j) throws IOException{
        // Variant that reads "key,value" lines into a map of doubles (not used by these jobs).
        Configuration conf=new Configuration();
        Path path=new Path(input);
        FileSystem fs=path.getFileSystem(conf);

        FileStatus[] stats=fs.listStatus(path);
        Map<String,Double> map=new HashMap<String,Double>();
        for(int i=0;i<stats.length;i++){
            if(stats[i].isFile()){
                FSDataInputStream infs=fs.open(stats[i].getPath());
                LineReader reader=new LineReader(infs,conf);
                Text line=new Text();
                while(reader.readLine(line)>0){
                    String[] temp=line.toString().split(",");
                    map.put(temp[0],Double.parseDouble(temp[1]));
                }
                reader.close();
            }
        }

        return map;
    }
	
	public static int getCountFromHDFS(String input) throws IOException{
        // Reads the distinct-word count of Part 2; expects lines whose second
        // comma-separated field is the count.
        Configuration conf=new Configuration();
        Path path=new Path(input);
        FileSystem fs=path.getFileSystem(conf);

        FileStatus[] stats=fs.listStatus(path);

        int count=0;
        for(int i=0;i<stats.length;i++){
            if(stats[i].isFile()){
                FSDataInputStream infs=fs.open(stats[i].getPath());
                LineReader reader=new LineReader(infs,conf);
                Text line=new Text();
                while(reader.readLine(line)>0){
                    String[] temp=line.toString().split(",");
                    count=Integer.parseInt(temp[1]);
                }
                reader.close();
            }
        }
        return count;
    }
	
	
}
Part 2: counting the distinct words (class tiaojian)

package naive;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class tiaojian {
	
	public static class tiaojianMapper extends Mapper<LongWritable, Text,Text, Text>{
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			String[] line = value.toString().split(":");
			String[] line1 = line[1].split(",");
			// Emit every word under a single constant key so the reducer can
			// build one global set of distinct words.
			String key1 = "1";
			for(int i = 0; i < line1.length; i++){
				context.write(new Text(key1), new Text(line1[i]));
			}
		}
	}
	
	
	public static class tiaojianCombine extends Reducer<Text, Text, Text, Text>{
		@Override
		protected void reduce(Text _key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// Local de-duplication before the data reaches the reducer.
			Set<String> set = new HashSet<String>();
			for(Text val : values){
				set.add(val.toString());
			}
			for(Iterator<String> it = set.iterator(); it.hasNext();){
				context.write(new Text("1"), new Text(it.next()));
			}
		}
	}


	public static class tiaojianReducer extends Reducer<Text, Text, Text, Text>{
		@Override
		protected void reduce(Text _key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// Global de-duplication: the size of the set is the number of distinct words.
			Set<String> set = new HashSet<String>();
			for(Text val : values){
				set.add(val.toString());
			}
			context.write(new Text("num is"), new Text(String.valueOf(set.size())));
		}
	}
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Count");
        job.setJarByClass(tiaojian.class);

        job.setMapperClass(tiaojianMapper.class);
        job.setCombinerClass(tiaojianCombine.class);
        job.setReducerClass(tiaojianReducer.class);
        
       
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

      
        FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		job.waitForCompletion(true);
		
	}
	
	

}
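A note on running order: the prediction job (class jieguo) reads the outputs of the other three jobs through proPath (per-label word counts from fenlei), countPath (the distinct-word count from tiaojian) and condiProPath (the conditional probabilities from jisuan), so those three jobs have to be run first and their output locations set in the configuration before the prediction job is started.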



