Background: a hadoop-2.8.5 cluster is already running on CentOS 7.5. The goal is to write and run MapReduce (MR) jobs from Windows.
Setting up the Windows hadoop environment
- Download the same version of hadoop as the linux cluster (hadoop-2.8.5).
- Configure the hadoop environment variables:
  - HADOOP_HOME: D:\hadoop-2.8.5
  - HADOOP_BIN_PATH: %HADOOP_HOME%\bin
  - HADOOP_PREFIX: %HADOOP_HOME%
  - Append ;%HADOOP_HOME%\bin;%HADOOP_HOME%\sbin; to the Path variable.
- Unzip winutils-master.zip and copy its files (the ones matching your hadoop version, if the archive ships several) into the bin directory under HADOOP_HOME.
- Verify the local hadoop installation, e.g. by running hadoop version in a command prompt (see the check sketched after this list).
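Before writing any MR code, it can also help to confirm that the variables above are visible to a JVM and that winutils.exe is in place. A minimal sketch (the class name and checks are illustrative, not part of the original setup):

import java.io.File;

/** Hypothetical helper: verifies HADOOP_HOME and winutils.exe before running local MR jobs. */
public class EnvCheck {
    public static void main(String[] args) {
        String home = System.getenv("HADOOP_HOME");
        if (home == null) {
            System.err.println("HADOOP_HOME is not set");
            return;
        }
        File winutils = new File(home, "bin\\winutils.exe");
        System.out.println("HADOOP_HOME = " + home);
        System.out.println("winutils.exe present: " + winutils.exists());
    }
}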
Writing the Mapper and Reducer
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * WCMapper: emits (word, 1) for every whitespace-separated token of the input line.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        String[] arr = value.toString().split(" ");
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * WCReducer: sums the counts for each word.
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        String tno = Thread.currentThread().getName();
        System.out.println(tno + " : WCReducer : " + key.toString() + "=" + count);
        context.write(key, new IntWritable(count));
    }
}
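To see the data flow end to end: for an input line such as hello world hello (hypothetical data), the mapper emits (hello,1), (world,1), (hello,1); the shuffle groups these into hello → [1,1] and world → [1]; the reducer then sums each list. The same counting logic can be sanity-checked in plain Java without a cluster (an illustrative sketch, not part of the original article):

import java.util.HashMap;
import java.util.Map;

/** Illustrative only: simulates the map and reduce steps for one line of input. */
public class WordCountSim {
    public static void main(String[] args) {
        Map<String, Integer> counts = new HashMap<>();
        for (String s : "hello world hello".split(" ")) { // "map": one (word, 1) per token
            counts.merge(s, 1, Integer::sum);             // "reduce": sum the counts per word
        }
        System.out.println(counts);                       // e.g. {world=1, hello=2}
    }
}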
Dependencies in the project's pom.xml:
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.8.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
        <version>2.8.5</version>
    </dependency>
    <dependency>
        <groupId>commons-cli</groupId>
        <artifactId>commons-cli</artifactId>
        <version>1.2</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
</dependencies>
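If the job later reads from hdfs:// paths and fails with "java.io.IOException: No FileSystem for scheme: hdfs", the HDFS client classes are missing from the classpath. Adding the matching hadoop-hdfs artifact usually resolves this (an extra dependency not listed in the original article; verify whether your transitive dependencies already provide it):

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.8.5</version>
</dependency>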
Add the jars under the share folder of the hadoop directory (e.g. %HADOOP_HOME%\share\hadoop) to the project as libraries.
Writing the job (driver) class
- Using the local file system (fs.defaultFS = file:///)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

/**
 * WCApp: word-count driver that runs against the local file system.
 */
public class WCApp {
    public static void main(String[] args) throws Exception {
        // This runs against the local Windows file system; the linux cluster is not involved.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");

        Job job = Job.getInstance(conf);

        // Set the job's properties.
        job.setJobName("WCApp");                          // job name
        job.setJarByClass(WCApp.class);                   // class used to locate the jar
        job.setInputFormatClass(TextInputFormat.class);   // input format

        // Output format class (optional):
        //job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path

        // Max / min split sizes (optional):
        //FileInputFormat.setMaxInputSplitSize(job, 13);
        //FileInputFormat.setMinInputSplitSize(job, 1L);

        job.setPartitionerClass(MyPartitioner.class); // custom partitioner (class not shown in this article)
        job.setCombinerClass(WCReducer.class);        // combiner
        job.setMapperClass(WCMapper.class);           // mapper
        job.setReducerClass(WCReducer.class);         // reducer
        job.setNumReduceTasks(3);                     // number of reduce tasks

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }
}
Note: args[0] and args[1] here are run arguments passed to the program.
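For example, the program arguments in an IDEA run configuration could be d:/mr/hello.txt d:/mr/out (hypothetical paths: the input file first, then the output directory). The output directory must not exist beforehand, otherwise FileOutputFormat refuses to start the job.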
- Using the linux cluster's HDFS
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RunJob {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "word count");
            job.setJarByClass(RunJob.class);
            job.setMapperClass(MyMap.class);       // mapper class (not shown in this listing)
            job.setReducerClass(MyReduce.class);   // reducer class (not shown in this listing)
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Fixes the error: "No job jar file set. User classes may not be found. See Job or Job#setJar(String)"
            job.setJar("E:\\bigData\\workspace\\testMR\\out\\artifacts\\MyMR\\MyMR.jar");
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileSystem fs = FileSystem.get(conf);
            Path op1 = new Path(args[1]);
            if (fs.exists(op1)) {
                fs.delete(op1, true);
                System.out.println("Output path already existed and has been deleted!");
            }
            FileOutputFormat.setOutputPath(job, op1);
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Note: for this variant the cluster's core-site.xml must be added to the project (e.g. on the classpath under src/main/resources) so the client knows where the cluster is, and the values of args[0] and args[1] must be configured accordingly; a sketch of the configuration follows.
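A minimal core-site.xml sketch, assuming the NameNode runs on a host named master on port 9000 (hostname and port are illustrative; in practice, copy the real file from the cluster):

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>
</configuration>

args[0] and args[1] can then be HDFS paths, e.g. /user/test/hello.txt and /user/test/out (again hypothetical).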
View the result in the HDFS web UI (for Hadoop 2.x, the NameNode UI listens on port 50070 by default).
- A possible error: Exception in thread "main" java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
Fix: download the source of the same hadoop version, take NativeIO.java from hadoop-common-project\hadoop-common\src\main\java\org\apache\hadoop\io\nativeio, copy it into the project under the same package (org.apache.hadoop.io.nativeio), and then modify the return value of the access method, as sketched below.
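The usual edit, sketched from the 2.x source (verify against the actual NativeIO.java you copied), replaces the body of Windows.access so it no longer calls the failing native access0 method:

// Inside the copied org.apache.hadoop.io.nativeio.NativeIO.Windows class:
public static boolean access(String path, AccessRight desiredAccess)
        throws IOException {
    // Original body: return access0(path, desiredAccess.accessRight());
    return true; // skip the native permission check that fails on Windows
}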
The two approaches differ mainly in the input source and the output location: the local file system versus the cluster's HDFS.