使用maven工具,创建简单的WordCount项目用hadoop map-reduce做分析
首先创建目录结构 src/main/java/com/xueyu/MyWordCount.java 存放JAVA源文件
然后创建maven配置文件pom.xml
<!-- Maven POM for the WordCount example; packaged as a plain jar. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Coordinates: produces target/myhadoop-wordcount-1.0-SNAPSHOT.jar -->
<groupId>com.xueyu</groupId>
<artifactId>myhadoop-wordcount</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>Maven Quick Start Archetype</name>
<url>http://maven.apache.org</url>
<dependencies>
<!-- hadoop-client pulls in common/hdfs/mapreduce client jars transitively;
     the version must match the Hadoop cluster the job runs against. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.2.0</version>
</dependency>
</dependencies>
</project>
然后用mvn package打包,在target目录下有myhadoop-wordcount-1.0-SNAPSHOT.jar
因为hadoop本身依赖很多其他jar包,运行时需要在classpath中指定,用maven命令mvn clean dependency:copy-dependencies package 自动将依赖jar包拷入target/dependency目录
然后创建lib目录,把myhadoop-wordcount-1.0-SNAPSHOT.jar和target/dependency下的其他依赖都拷入lib下
用简单的脚本找出CLASSPATH
# Build CLASSPATH from every jar under ./lib.
# Quoting handles paths containing spaces; the ${VAR:+...} expansion avoids
# emitting a spurious leading ':' when CLASSPATH starts out empty.
for file in ./lib/*.jar
do
    CLASSPATH="${CLASSPATH:+$CLASSPATH:}$file"
done
echo "$CLASSPATH"
在我本地显示
/opt/jdk1.7.0_17/lib:/opt/jdk1.7.0_17/jre/lib:.:./lib/activation-1.1.jar:./lib/aopalliance-1.0.jar:./lib/asm-3.1.jar:./lib/avro-1.7.4.jar:./lib/commons-beanutils-1.7.0.jar:./lib/commons-beanutils-core-1.8.0.jar:./lib/commons-cli-1.2.jar:./lib/commons-codec-1.4.jar:./lib/commons-collections-3.2.1.jar:./lib/commons-compress-1.4.1.jar:./lib/commons-configuration-1.6.jar:./lib/commons-digester-1.8.jar:./lib/commons-httpclient-3.1.jar:./lib/commons-io-2.1.jar:./lib/commons-lang-2.5.jar:./lib/commons-logging-1.1.1.jar:./lib/commons-math-2.1.jar:./lib/commons-net-3.1.jar:./lib/gmbal-api-only-3.0.0-b023.jar:./lib/grizzly-framework-2.1.2.jar:./lib/grizzly-http-2.1.2.jar:./lib/grizzly-http-server-2.1.2.jar:./lib/grizzly-http-servlet-2.1.2.jar:./lib/grizzly-rcm-2.1.2.jar:./lib/guava-11.0.2.jar:./lib/guice-3.0.jar:./lib/hadoop-annotations-2.2.0.jar:./lib/hadoop-auth-2.2.0.jar:./lib/hadoop-client-2.2.0.jar:./lib/hadoop-common-2.2.0.jar:./lib/hadoop-hdfs-2.2.0.jar:./lib/hadoop-mapreduce-client-app-2.2.0.jar:./lib/hadoop-mapreduce-client-common-2.2.0.jar:./lib/hadoop-mapreduce-client-core-2.2.0.jar:./lib/hadoop-mapreduce-client-jobclient-2.2.0.jar:./lib/hadoop-mapreduce-client-shuffle-2.2.0.jar:./lib/hadoop-yarn-api-2.2.0.jar:./lib/hadoop-yarn-client-2.2.0.jar:./lib/hadoop-yarn-common-2.2.0.jar:./lib/hadoop-yarn-server-common-2.2.0.jar:./lib/jackson-core-asl-1.8.8.jar:./lib/jackson-jaxrs-1.8.3.jar:./lib/jackson-mapper-asl-1.8.8.jar:./lib/jackson-xc-1.8.3.jar:./lib/javax.inject-1.jar:./lib/javax.servlet-3.1.jar:./lib/javax.servlet-api-3.0.1.jar:./lib/jaxb-api-2.2.2.jar:./lib/jaxb-impl-2.2.3-1.jar:./lib/jersey-client-1.9.jar:./lib/jersey-core-1.9.jar:./lib/jersey-grizzly2-1.9.jar:./lib/jersey-guice-1.9.jar:./lib/jersey-json-1.9.jar:./lib/jersey-server-1.9.jar:./lib/jersey-test-framework-core-1.9.jar:./lib/jersey-test-framework-grizzly2-1.9.jar:./lib/jettison-1.1.jar:./lib/jetty-util-6.1.26.jar:./lib/jsr305-1.3.9.jar:./lib/log4j-1.2.17.jar:./lib/management-api-3.0.0-b012.jar:./lib/my
hadoop-wordcount-1.0-SNAPSHOT.jar:./lib/paranamer-2.3.jar:./lib/protobuf-java-2.5.0.jar:./lib/slf4j-api-1.7.5.jar:./lib/slf4j-log4j12-1.7.5.jar:./lib/snappy-java-1.0.4.1.jar:./lib/stax-api-1.0.1.jar:./lib/xmlenc-0.52.jar:./lib/xz-1.0.jar:./lib/zookeeper-3.4.5.jar
设置环境变量
最后运行程序 java -cp $CLASSPATH com.xueyu.MyWordCount /var/log/syslog /tmp/temp/mavenhadoop/resulta
分析程序就是简单改造的Hadoop example中的WordCount,如下
package com.xueyu;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
//import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class MyWordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException
{
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
Configuration conf = new Configuration();
String[] otherargs = new GenericOptionsParser(conf, args).getRemainingArgs();
Job job = new Job(conf, "mywordcount");
//job.setJarByClass(MyWordCount.class);
job.setJar("/root/xueyu/myhadoop/tmp5/htry.jar");
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherargs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherargs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}