Checking which jar packages the project references:
https://blog.csdn.net/weixin_44393345/article/details/106337623
Upload the test data to HDFS; any custom data set will do:
hdfs dfs -put <local file> <HDFS destination path>
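The post does not show the input file itself. Judging from the partitioner below, which splits each line on a tab and reads the third field as the membership level, a plausible a.txt could look like this (fields separated by tab characters; the ids, names, and the fourth level are made up for illustration):
1001	zhangsan	普通
1002	lisi	VIP
1003	wangwu	高级
1004	zhaoliu	钻石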
Code
The mapper reads each line and passes it straight through without any transformation:
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @program: hdfs
 * @description:
 * @author: wenglei
 * @create: 2020-05-26 09:31
 **/
public class PartitionMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Emit the whole line as the key; the value carries no information.
        context.write(value, NullWritable.get());
    }
}
The reducer likewise performs no processing; it passes each key straight through to the output file:
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @program: hdfs
 * @description:
 * @author: wenglei
 * @create: 2020-05-26 13:49
 **/
public class PartitionReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Write each distinct line out unchanged.
        context.write(key, NullWritable.get());
    }
}
The partitioner assigns each record to a partition; the number of partitions must equal the number of reduce tasks:
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * @program: hdfs
 * @description:
 * @author: wenglei
 * @create: 2020-05-26 13:43
 **/
public class Partition extends Partitioner<Text, NullWritable> {
    @Override
    public int getPartition(Text text, NullWritable nullWritable, int numPartitions) {
        // Each line is tab-separated; the third field holds the member level.
        String[] split = text.toString().split("\t");
        String s = split[2];
        // Compare string contents with equals(), not ==; == only compares
        // object references, so it would route records incorrectly.
        if ("普通".equals(s)) {
            return 0;
        } else if ("VIP".equals(s)) {
            return 1;
        } else if ("高级".equals(s)) {
            return 2;
        } else {
            return 3;
        }
    }
}
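Why the equals() fix above matters: the original code compared the field with ==, which in Java tests whether two references point to the same object, not whether the characters match. Strings produced by split() are freshly allocated, so == is false even for identical contents and the records fall into the default branch. A minimal illustration:

String s = "普通\tVIP".split("\t")[0];
System.out.println(s == "普通");       // false: different object references
System.out.println("普通".equals(s));  // true: same character content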
The driver program that configures and runs the job:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * @program: hdfs
 * @description:
 * @author: wenglei
 * @create: 2020-05-26 13:50
 **/
public class PartitionMain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf(), PartitionMain.class.getSimpleName());
        job.setJarByClass(PartitionMain.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/input/a.txt"));

        job.setMapperClass(PartitionMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Register the custom partitioner
        job.setPartitionerClass(Partition.class);
        // The number of reduce tasks must equal the number of partitions
        job.setNumReduceTasks(4);

        job.setReducerClass(PartitionReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        // Output directory (must not already exist)
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/output"));

        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configured = new Configuration();
        int run = ToolRunner.run(configured, new PartitionMain(), args);
        System.exit(run);
    }
}
Running on the cluster:
yarn jar hdfs-1.0-SNAPSHOT.jar com.wenglei.MR.partition.PartitionMain
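With four reduce tasks the job should write four part files, one per partition. A quick way to verify (the file names follow Hadoop's default part-r-NNNNN scheme; the comments note which level each partition should contain):
hdfs dfs -ls /output
hdfs dfs -cat /output/part-r-00000   # partition 0: 普通
hdfs dfs -cat /output/part-r-00001   # partition 1: VIP
hdfs dfs -cat /output/part-r-00002   # partition 2: 高级
hdfs dfs -cat /output/part-r-00003   # partition 3: everything else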
Problem
Running the job with the custom partitioner failed with:
java.lang.Exception: java.io.IOException: Illegal partition for 1002 (1)
Causes of the error:
1. A pom dependency problem: the project used a non-Apache (vendor) Hadoop build, so the job can only run on the cluster.
2. Partition numbers start from 0; check that the partitioner's return values include 0.
3. The number of reduce tasks must match the number of partitions. The message above means record 1002 was assigned partition 1, which is illegal when the job is configured with fewer than two reduce tasks.
Solution:
Switch the pom to the plain Apache Hadoop artifacts (e.g. org.apache.hadoop:hadoop-client), or package the jar and run it on the cluster.
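Independent of the pom issue, the mismatch between getPartition() and setNumReduceTasks() can also be guarded against in code. A minimal sketch, not from the original post: use the numPartitions argument that the framework passes in, so the returned index never falls outside the legal range [0, numPartitions):

    @Override
    public int getPartition(Text text, NullWritable value, int numPartitions) {
        String level = text.toString().split("\t")[2];
        int p;
        if ("普通".equals(level)) p = 0;
        else if ("VIP".equals(level)) p = 1;
        else if ("高级".equals(level)) p = 2;
        else p = 3;
        // Clamp into the legal range so a too-small setNumReduceTasks()
        // merges the overflow partitions instead of throwing "Illegal partition".
        return Math.min(p, numPartitions - 1);
    }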