Parallelized Naive Bayes Classification Algorithm
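
This implementation splits Naive Bayes into two chained MapReduce jobs: a training job that counts, for each class y and each attribute/value pair, the frequencies f(y) and f(y, p_j = v_j), and a testing job that loads the resulting frequency table and labels every test record with the highest-scoring class. The score computed in NaiveBayesTest.TestMapper is the raw frequency product

    \hat{y} = \arg\max_{y} \, f(y) \prod_{j} f(y,\, p_j = v_j)

which stands in for the MAP rule \arg\max_{y} P(y) \prod_{j} P(v_j \mid y). Because the counts are never normalized by f(y), the two rules can disagree when class frequencies differ, and a single unseen (class, attribute, value) combination drives that class's score to zero.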

NaiveBayesMain.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class NaiveBayesMain
{
	public static void main(String[] args) throws Exception
	{
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		FileSystem fs = FileSystem.get(conf);
		Path path_train, path_temp, path_test, path_out;
		if(otherArgs.length != 5)
		{
			System.err.println("Usage: NaiveBayesMain <dfs_path> <conf> <train> <test> <out>");
			System.exit(2);
		}

		conf.set("conf", otherArgs[0] + "/" +otherArgs[1]);
		conf.set("train", otherArgs[0] + "/" +otherArgs[2]);
		conf.set("test", otherArgs[0] + "/" +otherArgs[3]);
		conf.set("output", otherArgs[0] + "/" +otherArgs[4]);
		
    	put2HDFS(otherArgs[1], otherArgs[0] + "/" + otherArgs[1], conf);
    	put2HDFS(otherArgs[2], otherArgs[0] + "/" + otherArgs[2], conf);
    	put2HDFS(otherArgs[3], otherArgs[0] + "/" + otherArgs[3], conf);
		
		path_train = new Path(otherArgs[0] + "/" + otherArgs[2]);
    	path_temp = new Path(otherArgs[0] + "/" + otherArgs[2] + ".train");
    	path_test = new Path(otherArgs[0] + "/" +otherArgs[3]);
    	path_out = new Path(otherArgs[0] + "/" + otherArgs[4]);
    	
		// Job 1: count class and class#attribute#value frequencies over the training set.
		{
		Job job_train = new Job(conf, "naive bayes training");
		job_train.setJarByClass(NaiveBayesMain.class);
		job_train.setMapperClass(NaiveBayesTrain.TrainMapper.class);
		job_train.setCombinerClass(NaiveBayesTrain.TrainReducer.class);
		job_train.setReducerClass(NaiveBayesTrain.TrainReducer.class);
		job_train.setOutputKeyClass(Text.class);
    	job_train.setOutputValueClass(IntWritable.class);
     	
    	FileInputFormat.setInputPaths(job_train, path_train);
    	if(fs.exists(path_temp))
    		fs.delete(path_temp, true);
    	FileOutputFormat.setOutputPath(job_train, path_temp);
    	if(!job_train.waitForCompletion(true))
    		System.exit(1);
    		
    	conf.set("train_result", otherArgs[0] + "/" +otherArgs[2] + ".train");
    	}
    	// Job 2: classify each test record using the frequency table from Job 1.
    	{
    	Job job_test = new Job(conf, "naive bayes testing");
    	job_test.setJarByClass(NaiveBayesTest.class);
    	job_test.setMapperClass(NaiveBayesTest.TestMapper.class);
    	job_test.setOutputKeyClass(Text.class);
    	job_test.setOutputValueClass(Text.class);
    	
    	FileInputFormat.setInputPaths(job_test, path_test);
    	if(fs.exists(path_out))
    		fs.delete(path_out, true);
    	FileOutputFormat.setOutputPath(job_test, path_out);
    	if(!job_test.waitForCompletion(true))
    		System.exit(1);
    	fs.delete(path_temp, true);
    	}
    	
    	getFromHDFS(otherArgs[0] + "/" + otherArgs[4], ".", conf);
    	
    	fs.close();
    	System.exit(0);
	}
	
	
	// Copies a local file to HDFS (overwrite = true, keep the local source).
	public static void put2HDFS(String src, String dst, Configuration conf) throws Exception
	{
		Path dstPath = new Path(dst);
		FileSystem hdfs = dstPath.getFileSystem(conf);
		
		hdfs.copyFromLocalFile(false, true, new Path(src), dstPath);
	}
	
	// Copies the job output from HDFS, removing any stale copy of the same
	// name first. Note copyToLocalFile(true, ...) also deletes the HDFS source.
	public static void getFromHDFS(String src, String dst, Configuration conf) throws Exception
	{
		Path dstPath = new Path(dst);
		FileSystem lfs = dstPath.getFileSystem(conf);
		String temp[] = src.split("/");
		Path ptemp = new Path(temp[temp.length-1]);
		if(lfs.exists(ptemp))
			lfs.delete(ptemp, true);
		lfs.copyToLocalFile(true, new Path(src), dstPath);
	}
}
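
A side note on the API: the driver builds its jobs with the pre-2.x Job constructor. Below is a minimal sketch of the same training-job setup using the Job.getInstance factory instead, assuming a Hadoop 2.x or later classpath (the mapper, combiner, reducer and key/value types are unchanged; the wrapper class name is hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class NewApiDriverSketch
{
	public static Job buildTrainJob(Configuration conf) throws Exception
	{
		// Job.getInstance replaces the deprecated "new Job(conf, name)".
		Job job_train = Job.getInstance(conf, "naive bayes training");
		job_train.setJarByClass(NaiveBayesMain.class);
		job_train.setMapperClass(NaiveBayesTrain.TrainMapper.class);
		job_train.setCombinerClass(NaiveBayesTrain.TrainReducer.class);
		job_train.setReducerClass(NaiveBayesTrain.TrainReducer.class);
		job_train.setOutputKeyClass(Text.class);
		job_train.setOutputValueClass(IntWritable.class);
		return job_train;
	}
}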

NaiveBayesTrain.java

import java.util.Scanner;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class NaiveBayesTrain
{
	public static class TrainMapper
		extends Mapper<Object, Text, Text, IntWritable>
	{
		public NaiveBayesConf nBConf;
		private final static IntWritable one = new IntWritable(1);
		private Text word;
		
		// Loads the attribute names from the shared conf file on HDFS before any map calls.
		public void setup(Context context) 
		{
			try{
			nBConf = new NaiveBayesConf();
			Configuration conf = context.getConfiguration();
			nBConf.ReadNaiveBayesConf(conf.get("conf"), conf);
			}
			catch(Exception ex)
			{
				ex.printStackTrace();
				System.exit(1);
			}
		}
		public void map(Object key, Text value, Context context)
			throws IOException, InterruptedException 
		{
			Scanner scan = new Scanner(value.toString());
			String str, vals[], temp;
			int i;
			word = new Text();
			while(scan.hasNextLine())
			{
				str = scan.nextLine();
				vals = str.split(" ");
				word.set(vals[0]);
				context.write(word, one);	// emit <class, 1>
				for(i = 1; i<vals.length; i++)	// emit <class#attribute#value, 1> per column
				{
					word = new Text();
					temp = vals[0] + "#" + nBConf.proNames.get(i-1);
					temp += "#" + vals[i];
					word.set(temp);					
					context.write(word, one);
				}
			}
		}
	}
	
	// Sums the 1s emitted for each key; addition is associative and
	// commutative, so the same class doubles as the combiner.
	public static class TrainReducer
		extends Reducer<Text,IntWritable,Text,IntWritable>
	{
		private IntWritable result = new IntWritable();
		public void reduce(Text key, Iterable<IntWritable> values, 
        	Context context) throws IOException, InterruptedException 
		{
			int sum = 0;
			for (IntWritable val : values) 
			{
				sum += val.get();
			}
			result.set(sum);
			context.write(key, result);
        }
	}
}
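
To make the emitted key format concrete, the following standalone sketch (plain Java, no Hadoop needed; the class name KeyFormatDemo is just for illustration) replays TrainMapper's tokenization on the first line of NBayes.train and prints the same four pairs that appear at the top of train-map-output.txt:

public class KeyFormatDemo
{
	public static void main(String[] args)
	{
		String[] proNames = {"p1", "p2", "p3"};	// attribute names from NBayes.conf
		String line = "cl1 5 6 7";				// first record of NBayes.train
		String[] vals = line.split(" ");
		System.out.println("<" + vals[0] + ",1>");	// class-count key
		for(int i = 1; i<vals.length; i++)			// one key per attribute column
			System.out.println("<" + vals[0] + "#" + proNames[i-1] + "#" + vals[i] + ",1>");
	}
}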

NaiveBayesTest.java

import java.util.Scanner;
import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class NaiveBayesTest
{
	public static class TestMapper 
		extends Mapper<Object, Text, Text, Text>
	{
		public NaiveBayesConf nBConf;
		public NaiveBayesTrainData nBTData;
		// Loads the attribute/class metadata and the training job's frequency
		// table before any map calls.
		public void setup(Context context)
		{			
			try{
			Configuration conf = context.getConfiguration();
			
			nBConf = new NaiveBayesConf();
			nBConf.ReadNaiveBayesConf(conf.get("conf"), conf);
			nBTData = new NaiveBayesTrainData();
			nBTData.getData(conf.get("train_result"), conf);
			}
			catch(Exception ex)
			{
				ex.printStackTrace();
				System.exit(1);
			}
		}
		
		public void map(Object key, Text value, Context context)
			throws IOException, InterruptedException 
		{
			Scanner scan = new Scanner(value.toString());
			String str, vals[], temp;
			int i, j, fxyi, fyi, fyij, maxf, idx;
			Text id;
			Text cls;
			
			while(scan.hasNextLine())
			{
				str = scan.nextLine();
				vals = str.split(" ");
				maxf = -100;	// best score seen so far (real scores are always >= 0)
				idx = -1;		// index of the winning class
				// Score each class by f(y) * prod_j f(y, p_j = v_j).
				for(i = 0; i<nBConf.class_num; i++)
				{
					fxyi = 1;
					String cl = nBConf.classNames.get(i);
					Integer integer = nBTData.freq.get(cl);
					if(integer == null)
						fyi = 0;
					else
						fyi = integer.intValue();
					for(j = 1; j<vals.length; j++)
					{
						temp = cl + "#" + nBConf.proNames.get(j-1) + "#" + vals[j];
						
						integer = nBTData.freq.get(temp);
						if(integer == null)
							fyij = 0;
						else
							fyij = integer.intValue();
						fxyi = fxyi*fyij;
					}
					if(fyi*fxyi > maxf)
					{
						maxf = fyi*fxyi;
						idx = i;
					}
				}
				id = new Text(vals[0]);
				cls = new Text(nBConf.classNames.get(idx));
				context.write(id, cls);
			}
		}
	}
}
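
Two caveats about this scoring: because TestMapper multiplies raw counts, any attribute value never seen together with a class during training forces that class's score to zero, and a common refinement (not implemented above) is Laplace add-one smoothing, i.e. using f(y, p_j = v_j) + 1 in each factor. Also, the proRanges values read from NBayes.conf are never used anywhere in this code, although they are exactly the per-attribute denominators such a smoothed probability estimate would need.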

NaiveBayesConf.java

import java.util.ArrayList;
import java.util.Scanner;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;

public class NaiveBayesConf
{
	public int dimen;						// number of attributes
	public int class_num;					// number of classes
	public ArrayList<String> classNames;	// class labels
	public ArrayList<String> proNames;		// attribute (property) names
	public ArrayList<Integer> proRanges;	// attribute value ranges
	
	public NaiveBayesConf()
	{
		dimen = class_num = 0;
		classNames = new ArrayList<String>();
		proNames = new ArrayList<String>();
		proRanges = new ArrayList<Integer>();
	}
	
	public void ReadNaiveBayesConf(String file, Configuration conf) throws Exception
	{	
		Path conf_path = new Path(file);
		FileSystem hdfs = conf_path.getFileSystem(conf);
		FSDataInputStream fsdt = hdfs.open(conf_path);
		Scanner scan = new Scanner(fsdt);
		String str = scan.nextLine();
		String[] vals = str.split(" ");
		
		class_num = Integer.parseInt(vals[0]);
		
		int i;
		
		for(i = 1; i<vals.length; i++)
		{
			classNames.add(vals[i]);
		}
		
		str = scan.nextLine();
		vals = str.split(" ");
		dimen = Integer.parseInt(vals[0]);
		
		for(i = 1; i<vals.length; i+=2)
		{
			proNames.add(vals[i]);
			proRanges.add(Integer.valueOf(vals[i+1]));
		}
		fsdt.close();
		scan.close();
	}
}

NaiveBayesTrainData.java

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
// Reads the "part-*" output files under the given path and loads their <key, count> pairs into a HashMap.
public class NaiveBayesTrainData
{
	public HashMap<String, Integer> freq;

	public NaiveBayesTrainData()
	{
		freq = new HashMap<String, Integer>();
	}

	public void getData(String file, Configuration conf) throws IOException
	{
		int i;
		Path data_path = new Path(file);
		Path file_path;
		String temp[], line;
		FileSystem hdfs = data_path.getFileSystem(conf);
		
		FileStatus[] status = hdfs.listStatus(data_path);
		
		for(i = 0; i<status.length; i++)
		{
			file_path = status[i].getPath();
			if(status[i].isDir())	// skip subdirectories
				continue;
			line = file_path.toString();
			temp = line.split("/");
			// Only reducer output files ("part-00000", "part-r-00000", ...) hold counts;
			// startsWith also avoids an index error on names shorter than five characters.
			if(!temp[temp.length-1].startsWith("part-"))
				continue;
			FSDataInputStream fin = hdfs.open(file_path);
			InputStreamReader inr = new InputStreamReader(fin);
			BufferedReader bfr = new BufferedReader(inr);
			while((line = bfr.readLine()) != null)
			{	
				// Each line is "key<TAB>count" as written by the training reducer.
				String res[] = line.split("\t");
				freq.put(res[0], Integer.valueOf(res[1]));
			}
			bfr.close();
			inr.close();
			fin.close();
		}
	}
	
}

NBayes.conf

4 cl1 cl2 cl3 cl4
3 p1 12 p2 16 p3 17
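
As parsed by NaiveBayesConf.ReadNaiveBayesConf, the first line gives the number of classes followed by their names, and the second gives the number of attributes followed by (name, value range) pairs: four classes cl1..cl4, and three attributes p1, p2, p3 with value ranges 12, 16 and 17. Note that NBayes.train below contains one cl5 record; it is counted during training (see the cl5 entries in train-reduce-output.txt) but never scored at test time, since only the four declared classes are iterated over.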

NBayes.train

cl1 5 6 7
cl2 3 8 4
cl1 2 5 2
cl3 7 8 7
cl4 3 8 2
cl4 9 2 7
cl2 1 8 5
cl5 2 9 4
cl3 10 3 4
cl1 4 5 6
cl3 4 6 7


NBayes.test
1 5 6 7
2 1 8 5
3 2 9 4
4 10 3 4
5 4 5 6
6 3 8 4
7 2 5 2
8 7 8 7
9 3 8 2
10 9 2 7
11 4 6 7
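
In the test file the first column is a record id rather than a class label; TestMapper echoes it back as the output key with the predicted class as the value, which is why the test job sets no custom reducer.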

train-map-output.txt

trainmapinput:cl1 5 6 7
trainmapoutput:<cl1,1>
trainmapoutput:<cl1#p1#5,1>
trainmapoutput:<cl1#p2#6,1>
trainmapoutput:<cl1#p3#7,1>
trainmapinput:cl2 3 8 4
trainmapoutput:<cl2,1>
trainmapoutput:<cl2#p1#3,1>
trainmapoutput:<cl2#p2#8,1>
trainmapoutput:<cl2#p3#4,1>
trainmapinput:cl1 2 5 2
trainmapoutput:<cl1,1>
trainmapoutput:<cl1#p1#2,1>
trainmapoutput:<cl1#p2#5,1>
trainmapoutput:<cl1#p3#2,1>
trainmapinput:cl3 7 8 7
trainmapoutput:<cl3,1>
trainmapoutput:<cl3#p1#7,1>
trainmapoutput:<cl3#p2#8,1>
trainmapoutput:<cl3#p3#7,1>
trainmapinput:cl4 3 8 2
trainmapoutput:<cl4,1>
trainmapoutput:<cl4#p1#3,1>
trainmapoutput:<cl4#p2#8,1>
trainmapoutput:<cl4#p3#2,1>
trainmapinput:cl4 9 2 7
trainmapoutput:<cl4,1>
trainmapoutput:<cl4#p1#9,1>
trainmapoutput:<cl4#p2#2,1>
trainmapoutput:<cl4#p3#7,1>
trainmapinput:cl2 1 8 5
trainmapoutput:<cl2,1>
trainmapoutput:<cl2#p1#1,1>
trainmapoutput:<cl2#p2#8,1>
trainmapoutput:<cl2#p3#5,1>
trainmapinput:cl5 2 9 4
trainmapoutput:<cl5,1>
trainmapoutput:<cl5#p1#2,1>
trainmapoutput:<cl5#p2#9,1>
trainmapoutput:<cl5#p3#4,1>
trainmapinput:cl3 10 3 4
trainmapoutput:<cl3,1>
trainmapoutput:<cl3#p1#10,1>
trainmapoutput:<cl3#p2#3,1>
trainmapoutput:<cl3#p3#4,1>
trainmapinput:cl1 4 5 6
trainmapoutput:<cl1,1>
trainmapoutput:<cl1#p1#4,1>
trainmapoutput:<cl1#p2#5,1>
trainmapoutput:<cl1#p3#6,1>
trainmapinput:cl3 4 6 7
trainmapoutput:<cl3,1>
trainmapoutput:<cl3#p1#4,1>
trainmapoutput:<cl3#p2#6,1>
trainmapoutput:<cl3#p3#7,1>

train-reduce-output.txt

cl1	3
cl1#p1#2	1
cl1#p1#4	1
cl1#p1#5	1
cl1#p2#5	2
cl1#p2#6	1
cl1#p3#2	1
cl1#p3#6	1
cl1#p3#7	1
cl2	2
cl2#p1#1	1
cl2#p1#3	1
cl2#p2#8	2
cl2#p3#4	1
cl2#p3#5	1
cl3	3
cl3#p1#10	1
cl3#p1#4	1
cl3#p1#7	1
cl3#p2#3	1
cl3#p2#6	1
cl3#p2#8	1
cl3#p3#4	1
cl3#p3#7	2
cl4	2
cl4#p1#3	1
cl4#p1#9	1
cl4#p2#2	1
cl4#p2#8	1
cl4#p3#2	1
cl4#p3#7	1
cl5	1
cl5#p1#2	1
cl5#p2#9	1
cl5#p3#4	1

test-mapreduce-output.txt

1	cl1
10	cl4
11	cl3
2	cl2
3	cl1
4	cl3
5	cl1
6	cl2
7	cl1
8	cl3
9	cl4
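
As a check, trace record 1 ("1 5 6 7") through the frequency table above: for cl1, f(cl1) = 3 and f(cl1#p1#5), f(cl1#p2#6), f(cl1#p3#7) are all 1, so the score is 3 * 1 * 1 * 1 = 3; every other class is missing at least one factor (there is no cl2#p1#5 entry, for instance), so its product is 0, and cl1 wins, matching the first output line.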




