一、centos7系统上部署mapreduce的eclipse开发环境
1.linux下安装eclipse
>>官网下载eclipse-jee-oxygen-R-linux-gtk-x86_64.tar.gz
>>下载eclipse的hadoop2.2.0的插件jar包,复制到/usr/local/eclipse/plugin中,
如果水平高的,可以自己下载源码编译
>>打开eclipse的安装目录,运行 ./eclipse 即可
2.参数配置
按照上述步骤运行eclipse后,打开windows>> show perspective>>mapreduce
视图中出现小象的标志,点击新建,
弹出参数窗口mr(v2)配置为192.168.10.200,端口设置为在yarn-site.xml中yarn.resourcemanager.scheduler.address的端口
dfs的参数设置为core-site.xml中的fs.defaultFS的IP和端口
如果mapreduce的端口配错了,advanced parameters的参数界面出不来
点击进入advanced parameters参数配置
dfs.replication 2
dfs.namenode.name.dir
dfs.datanode.data.dir
yarn.resourcemanager.address
这四个参数是我在配置hadoop文件的时候用到过的,也是现在的参数列表里有的,
因此都要设置成配置文件的值,其余都用默认值
注意:其他的所有的参数都不用变,但是自己配过的参数一定要在此使配置相同
3.退出,格式化hdfs,启动hadoop,启动eclipse
4.连接完成后就可以新建项目
注意:不是新建java项目,而是mapreduce项目,新建完成后进行测试
>>官网下载eclipse-jee-oxygen-R-linux-gtk-x86_64.tar.gz
>>下载eclipse的hadoop2.2.0的插件jar包,复制到/usr/local/eclipse/plugin中,
如果水平高的,可以自己下载源码编译
>>打开eclipse的安装目录,运行 ./eclipse 即可
2.参数配置
按照上述步骤运行eclipse后,打开windows>> show perspective>>mapreduce
视图中出现小象的标志,点击新建,
弹出参数窗口mr(v2)配置为192.168.10.200,端口设置为在yarn-site.xml中yarn.resourcemanager.scheduler.address的端口
dfs的参数设置为core-site.xml中的fs.defaultFS的IP和端口
如果mapreduce的端口配错了,advanced parameters的参数界面出不来
点击进入advanced parameters参数配置
dfs.replication 2
dfs.namenode.name.dir
dfs.datanode.data.dir
yarn.resourcemanager.address
这四个参数是我在配置hadoop文件的时候用到过的,也是现在的参数列表里有的,
因此都要设置成配置文件的值,其余都用默认值
注意:其他的所有的参数都不用变,但是自己配过的参数一定要在此使配置相同
3.退出,格式化hdfs,启动hadoop,启动eclipse
4.连接完成后就可以新建项目
注意:不是新建java项目,而是mapreduce项目,新建完成后进行测试
二、 倒排索引程序测试
新建Map、Combine、Reduce以及ReduceDriver程序,源码如下
Map的源码:
package MapClass;
package MapClass;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.util.StringTokenizer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class STMap extends Mapper<LongWritable, Text, Text, Text> {
private static final Text one=new Text("1");
private Text key=new Text();
private Text value=new Text();
private static final Text one=new Text("1");
private Text key=new Text();
private Text value=new Text();
public void map(LongWritable ikey, Text ivalue, Context context)
throws IOException, InterruptedException {
// FileSplit file=(FileSplit)context.getInputSplit();
// String filename=file.getPath().getName();
String filename=context.getInputSplit().toString();
String line=ivalue.toString();
StringTokenizer cutter= new StringTokenizer(line);
while(cutter.hasMoreTokens()) {
key.set(filename+"|"+cutter.nextToken());
value.set(one);
context.write(key, value);
}
}
}
throws IOException, InterruptedException {
// FileSplit file=(FileSplit)context.getInputSplit();
// String filename=file.getPath().getName();
String filename=context.getInputSplit().toString();
String line=ivalue.toString();
StringTokenizer cutter= new StringTokenizer(line);
while(cutter.hasMoreTokens()) {
key.set(filename+"|"+cutter.nextToken());
value.set(one);
context.write(key, value);
}
}
}
Combine的源码:
package RuducerClass;
package RuducerClass;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class STCombiner extends Reducer<Text, Text, Text, Text> {
private Text key=new Text();
private Text value=new Text();
public void reduce(Text _key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
int sum=0;
for (Text val : values) {
sum+=Integer.parseInt(val.toString());
}
String[] filename=_key.toString().split("\\|");
key.set(filename[1]);
value.set(filename[0]+":"+String.valueOf(sum));
context.write(key, value);
}
}
private Text key=new Text();
private Text value=new Text();
public void reduce(Text _key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
int sum=0;
for (Text val : values) {
sum+=Integer.parseInt(val.toString());
}
String[] filename=_key.toString().split("\\|");
key.set(filename[1]);
value.set(filename[0]+":"+String.valueOf(sum));
context.write(key, value);
}
}
Reduce源码:
package RuducerClass;
package RuducerClass;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class STReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text _key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String name="";
for (Text val : values) {name=name+" "+val.toString()+"&";}
context.write(_key, new Text(name));
}
}
public void reduce(Text _key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String name="";
for (Text val : values) {name=name+" "+val.toString()+"&";}
context.write(_key, new Text(name));
}
}
主程序源码:
package MRDriver;
package MRDriver;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class STDriver {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://192.168.10.200:9000");
conf.set("yarn.resourcemanager.scheduler.address","hdfs://192.168.10.200:8030");
Job job = Job.getInstance(conf, "JobName");
job.setJarByClass(MRDriver.STDriver.class);
job.setMapperClass(MapClass.STMap.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setCombinerClass(RuducerClass.STCombiner.class);
job.setReducerClass(RuducerClass.STReducer.class);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://192.168.10.200:9000");
conf.set("yarn.resourcemanager.scheduler.address","hdfs://192.168.10.200:8030");
Job job = Job.getInstance(conf, "JobName");
job.setJarByClass(MRDriver.STDriver.class);
job.setMapperClass(MapClass.STMap.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setCombinerClass(RuducerClass.STCombiner.class);
job.setReducerClass(RuducerClass.STReducer.class);
// TODO: specify output types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
/*在调试程序的时候,由于每次都要删除上一次跑完的输出文件夹/st,所以在程序运行时先删除该目录*/
Path out=new Path("/output/st");
FileSystem hdfs=FileSystem.get(conf);
// 也可以使用 FileSystem hdfs=out.getFileSystem(conf);
try {
hdfs.delete(out,true);
}catch(Exception e) {e.getStackTrace();}
// TODO: specify input and output DIRECTORIES (not files)
FileInputFormat.setInputPaths(job, new Path("/input/st/"));
System.out.println("---files found----");
FileOutputFormat.setOutputPath(job, new Path("/output/st"));
System.out.println("---output found----");
if (!job.waitForCompletion(true))
return;
}
}
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
/*在调试程序的时候,由于每次都要删除上一次跑完的输出文件夹/st,所以在程序运行时先删除该目录*/
Path out=new Path("/output/st");
FileSystem hdfs=FileSystem.get(conf);
// 也可以使用 FileSystem hdfs=out.getFileSystem(conf);
try {
hdfs.delete(out,true);
}catch(Exception e) {e.getStackTrace();}
// TODO: specify input and output DIRECTORIES (not files)
FileInputFormat.setInputPaths(job, new Path("/input/st/"));
System.out.println("---files found----");
FileOutputFormat.setOutputPath(job, new Path("/output/st"));
System.out.println("---output found----");
if (!job.waitForCompletion(true))
return;
}
}
在/input/st/下的三个输入文件名和内容为:
file01:i love mapreduce
file02:i like hadoop
file03:i do not like hadoop and mapreduce neither
输出的结果文件内容为:
/*结构:单词 路径+词频+&*/
and hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
do hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
hadoop hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
hdfs://192.168.10.200:9000/input/st/file02:0+14:1&
i hdfs://192.168.10.200:9000/input/st/file02:0+14:1&
hdfs://192.168.10.200:9000/input/st/file01:0+17:1&
hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
like hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
hdfs://192.168.10.200:9000/input/st/file02:0+14:1&
love hdfs://192.168.10.200:9000/input/st/file01:0+17:1&
mapreduce hdfs://192.168.10.200:9000/input/st/file01:0+17:1&
hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
neither hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
not hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
/*结构:单词 路径+词频+&*/
and hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
do hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
hadoop hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
hdfs://192.168.10.200:9000/input/st/file02:0+14:1&
i hdfs://192.168.10.200:9000/input/st/file02:0+14:1&
hdfs://192.168.10.200:9000/input/st/file01:0+17:1&
hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
like hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
hdfs://192.168.10.200:9000/input/st/file02:0+14:1&
love hdfs://192.168.10.200:9000/input/st/file01:0+17:1&
mapreduce hdfs://192.168.10.200:9000/input/st/file01:0+17:1&
hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
neither hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
not hdfs://192.168.10.200:9000/input/st/file03:0+43:1&
在调试过程主要遇到以下两个问题:
错误一:
源码:
Path out=new Path("/output/st");
FileSystem hdfs=FileSystem.get(conf);
// or FileSystem hdfs=out.getFileSystem(conf);
hdfs.delete(out,true);
报错无法识别文件系统,因此需要在之前让configuration读取配置文件core-site.xml中hdfs的信息
在前面增加一行:conf.set("fs.defaultFS", "hdfs://192.168.10.200:9000"),配置完成后指定的
FileSystem才是hdfs,否则就会默认使用linux的文件系统。
错误一:
源码:
Path out=new Path("/output/st");
FileSystem hdfs=FileSystem.get(conf);
// or FileSystem hdfs=out.getFileSystem(conf);
hdfs.delete(out,true);
报错无法识别文件系统,因此需要在之前让configuration读取配置文件core-site.xml中hdfs的信息
在前面增加一行:conf.set("fs.defaultFS", "hdfs://192.168.10.200:9000"),配置完成后指定的
FileSystem才是hdfs,否则就会默认使用linux的文件系统。
错误二:
源码:
// FileSplit file=(FileSplit)context.getInputSplit();
// String filename=file.getPath().getName();
String filename=context.getInputSplit().toString();
开始的两行是网上包括书上获取文件名称的方法,但是在跑程序的时候出现的问题是,一旦使用上述
两行代码,即使程序不报错,最后也没有结果文件输出,具体原因也没有搞清楚,但是在换用第三行
代码之后就能达到预期效果。