The latest version: 2.5.1.
One of Hadoop's best ideas is moving code to the data: code is usually small while data is huge, so having the code travel to where the data lives is the natural choice.
After downloading 2.5.1, unpack it, enter the directory, and run:
root@idc66:/hadoop-2.5.1# ./bin/hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.1.jar wordcount
Usage: wordcount <in> [<in>...] <out>
root@idc66:/hadoop-2.5.1#
In other words, some arguments are still missing!
Run mkdir input and put a small text file into the new directory for wordcount to read.
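For example (the file name is my choice; the contents match the counts shown below):
echo 'hello world !!!' > input/file01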
Then run:
./bin/hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.1.jar wordcount input output
and you will see the output. The first Hadoop 2.5.1 program has run successfully!
root@idc66:/hadoop-2.5.1# cat output/part-r-00000
!!! 1
hello 1
world 1
--------------------------
To make learning easier, I do not recommend that newcomers start with a distributed environment;
my personal suggestion is to learn Hadoop 2.5.1 on a single local machine first.
In the unpacked directory, edit the configuration file:
./etc/hadoop/core-site.xml
<property>
  <name>fs.defaultFS</name> <!-- fs.default.name is the old, deprecated spelling of this key -->
  <value>hdfs://idc66:9000</value> <!-- points to the NameNode's host:port -->
</property>
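To sanity-check the setting, a minimal sketch like the one below connects to HDFS and prints the file system URI (the class name is my choice; it assumes core-site.xml is on the classpath):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class CheckFs {
    public static void main(String[] args) throws Exception {
        // new Configuration() loads core-site.xml from the classpath,
        // so fs.defaultFS should resolve to hdfs://idc66:9000.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        System.out.println("Connected to: " + fs.getUri());
    }
}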
-----------------------------------------------------------------------------------
About HDFS: a file is uploaded from the local machine into HDFS, a program then reads it from HDFS, and when the job finishes it writes new files back into HDFS.
-----------------------------------------------------------------------------------
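That upload-read round trip can also be driven from the Java API. A minimal sketch, with the same setup as CheckFs above (all paths here are examples):

import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsRoundTrip {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // Upload: local file -> HDFS, same as 'hadoop fs -put'.
        fs.copyFromLocalFile(new Path("./local.txt"), new Path("/user/root/local.txt"));
        // Read the first line back out of HDFS.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path("/user/root/local.txt"))))) {
            System.out.println(reader.readLine());
        }
    }
}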
Now let's write our first MR program.
First, download the data.
Log in to Linux, start Hadoop,
and download the data we need:
http://www.nber.org/patents/Cite75_99.zip
http://www.nber.org/patents/apat63_99.zip
Just unzip them and you are ready.
After starting Hadoop, upload the file (with no destination given, it lands in your HDFS home directory, /user/root):
./bin/hadoop fs -put ./usa_data/apat63_99.txt
List the files with:
./bin/hadoop fs -lsr /user/root
Then write the following code:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MyJob extends Configured implements Tool {

    // With TextInputFormat the mapper receives <byte offset, line of text>.
    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Column 1 of apat63_99.txt is the grant year (GYEAR);
            // note the CSV header line gets counted as a "year" too.
            String year = value.toString().split(",")[1];
            context.write(new Text(year), new Text("1"));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Sum the "1" strings emitted by the mapper.
            long count = 0;
            for (Text val : values) {
                count += Long.parseLong(val.toString());
            }
            context.write(key, new Text(String.valueOf(count)));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "MyJob");
        job.setJarByClass(MyJob.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        job.setInputFormatClass(TextInputFormat.class); // fixes k1,v1 as <offset, line>

        job.setMapperClass(MapClass.class);
        job.setReducerClass(Reduce.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Return the status instead of calling System.exit() here,
        // so ToolRunner and main() can report it.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new MyJob(), args);
        System.exit(res);
    }
}
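Before running it, compile the class against the Hadoop jars and pack it up. A sketch of one way to do that ('hadoop classpath' prints the jars Hadoop itself uses; the jar name MyJob.jar is my choice):
javac -classpath "$(./bin/hadoop classpath)" MyJob.java
jar cf MyJob.jar MyJob*.class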
Run the job:
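Assuming the jar built above, the invocation would look like this (the input is the uploaded apat63_99.txt, and the output directory is apat63_99.rst):
./bin/hadoop jar MyJob.jar MyJob apat63_99.txt apat63_99.rst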
Check the files:
./bin/hadoop fs -lsr /user/root
lsr: DEPRECATED: Please use 'ls -R' instead.
drwxr-xr-x - root supergroup 0 2014-11-07 14:27 /user/root/apat63_99.rst
-rw-r--r-- 3 root supergroup 0 2014-11-07 14:27 /user/root/apat63_99.rst/_SUCCESS
-rw-r--r-- 3 root supergroup 9 2014-11-07 14:27 /user/root/apat63_99.rst/part-r-00000
-rw-r--r-- 3 root supergroup 236903179 2014-11-07 13:48 /user/root/apat63_99.txt
Check the contents:
./bin/hadoop fs -tail /user/root/apat63_99.rst/part-r-00000
...
Pretty simple stuff; not much more to say.