A large file often needs to be partitioned, and the simplest scheme is a hash partition on the record ID. MapReduce's partitioning mechanism can split one file into several output files, which makes downstream computation easier: in KNN, for example, the prediction set can be cut into several small slices that are read in and predicted independently.
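For reference, Hadoop's built-in org.apache.hadoop.mapreduce.lib.partition.HashPartitioner already does exactly this for arbitrary keys; its entire logic is the one-liner below, and the custom partitioner at the end of this post differs only in parsing the key as a numeric ID instead of calling hashCode():

// Hadoop's default partitioner: mask the sign bit so the result is
// non-negative, then take the hash modulo the number of reduce tasks.
public int getPartition(K key, V value, int numReduceTasks) {
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}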
package com.mr.partition;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Load the prediction data into memory, then iterate to do the computation.
 * Suitable when the prediction data is small and the training data is large;
 * if the prediction data is large, it can be split into multiple parts and
 * each part computed separately.
 * @author lenovo
 * 1. Compute the Euclidean distance (adapt the distance formula to the actual case).
 * 2. Find the nearest neighbors.
 * To emit the top k, use TreeSet<TopKeyWritable> with a hand-written TopKeyWritable ordering.
 */
public class IDhashMR extends Configured implements Tool {
    public static enum Counter {
        PARSER_ERR
    }
    public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
        private Text mykey = new Text();
        private Text myval = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line has the form "id,value"; emit the ID as the
            // map output key so the partitioner can route on it.
            String[] array = value.toString().split(",");
            mykey.set(array[0]);
            myval.set(array[1]);
            context.write(mykey, myval);
        }
    }
    public static class MyReduce extends Reducer<Text, Text, Text, Text> {
        private Text val = new Text();
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Iterate over the Iterable and accumulate the values for this ID;
            // each map output value is a single numeric field.
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }
            val.set(String.valueOf(sum));
            context.write(key, val);
        }
    }
    @Override
    public int run(String[] args) throws Exception {
        // 1 conf
        Configuration conf = getConf(); // set by ToolRunner, so -D options are honored
        // Separator between key and value in the output file (in older
        // Hadoop 1.x releases this property was named
        // mapred.textoutputformat.separator).
        conf.set("mapreduce.output.textoutputformat.separator", ",");
        // DistributedCache.addCacheFile(new Path(args[2]).toUri(), conf); // add a cache file for this job
        // 2 create job
        // Job job = new Job(conf, ModuleMapReduce.class.getSimpleName());
        Job job = this.parseInputAndOutput(this, conf, args);
        // 3 set job
        // 3.1 set run jar class
        // job.setJarByClass(ModuleReducer.class);
        // 3.2 set inputformat
        job.setInputFormatClass(TextInputFormat.class);
        // 3.3 set input path
        // FileInputFormat.addInputPath(job, new Path(args[0]));
        // 3.4 set mapper
        job.setMapperClass(MyMap.class);
        // 3.5 set map output key/value class
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // 3.6 set partitioner class
        job.setPartitionerClass(IDhashPartition.class);
        // 3.7 set reduce number (must be greater than the largest partition
        // number the partitioner can return)
        job.setNumReduceTasks(2);
        // 3.8 set sort comparator class
        // job.setSortComparatorClass(LongWritable.Comparator.class);
        // 3.9 set group comparator class
        // job.setGroupingComparatorClass(LongWritable.Comparator.class);
        // 3.10 set combiner class
        // job.setCombinerClass(null);
        // 3.11 set reducer class
        job.setReducerClass(MyReduce.class);
        // 3.12 set output format
        job.setOutputFormatClass(TextOutputFormat.class);
        // 3.13 set job output key/value class
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // 3.14 set job output path
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 4 submit job
        boolean isSuccess = job.waitForCompletion(true);
        // 5 exit
        // System.exit(isSuccess ? 0 : 1);
        return isSuccess ? 0 : 1;
    }
    public Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
            throws Exception {
        // validate
        // if (args.length != 2) {
        //     System.err.printf("Usage: %s [generic options] <input> <output>\n",
        //             tool.getClass().getSimpleName());
        //     ToolRunner.printGenericCommandUsage(System.err);
        //     return null;
        // }
        // 2 create job (Job.getInstance replaces the Job constructor, which
        // is deprecated in Hadoop 2.x)
        Job job = Job.getInstance(conf, tool.getClass().getSimpleName());
        // 3.1 set run jar class
        job.setJarByClass(tool.getClass());
        // 3.3 set input path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // 3.14 set job output path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job;
    }
    public static void main(String[] args) throws Exception {
        // Hard-coded HDFS paths for testing; pass <input> <output> on the
        // command line to override them.
        if (args.length != 2) {
            args = new String[] {
                    "hdfs://192.168.192.129:9000/ml/knn/partitioner.txt",
                    "hdfs://192.168.192.129:9000/ml/knn/partitioner/" };
        }
        // run mapreduce
        int status = ToolRunner.run(new IDhashMR(), args);
        // exit
        System.exit(status);
    }
}
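The custom partitioner registered above via job.setPartitionerClass lives in its own source file: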
package com.mr.partition;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Custom partitioner: routes each record to a reduce task based on its numeric ID.
 * @author Administrator
 */
public class IDhashPartition extends Partitioner<Text, Text> {
    /**
     * Input comes from the map output.
     * @author zengzhaozheng
     * @param key the map output key
     * @param value the map output value
     * @param numPartitions total number of partitions, i.e. the number of reduce tasks
     */
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        /*
         * Note: this follows the default hash-partition recipe but routes on
         * the ID itself. The framework's default HashPartitioner only
         * guarantees that equal keys land in the same partition; hashing the
         * whole key gives no control over which partition a given ID goes
         * to, which is not the behavior we want here.
         */
        // Mask the sign bit so a negative ID cannot produce a negative
        // partition, then take the ID modulo the number of reduce tasks.
        // (The original "% 10" could return a partition >= numPartitions
        // whenever fewer than 10 reducers are configured, which fails the
        // job with "Illegal partition".)
        return (Integer.parseInt(key.toString().trim()) & Integer.MAX_VALUE) % numPartitions;
    }
}
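As a quick sanity check of the routing, a minimal standalone sketch (the sample IDs are made up for illustration, and numPartitions matches the job's setNumReduceTasks(2)):

public class PartitionCheck {
    public static void main(String[] args) {
        int numPartitions = 2; // same value as job.setNumReduceTasks(2)
        for (String id : new String[] { "3", "10", "27" }) {
            // Same formula as IDhashPartition.getPartition
            int p = (Integer.parseInt(id) & Integer.MAX_VALUE) % numPartitions;
            System.out.println("id " + id + " -> partition " + p);
        }
    }
}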