Managing a Hadoop Development Project with Maven

Using Maven, we build a simple WordCount project and run the analysis with Hadoop MapReduce.


First, create the standard Maven directory layout; the Java source file lives at src/main/java/com/xueyu/MyWordCount.java.
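A minimal sketch of setting up that layout from an empty project directory (the package path com.xueyu matches the groupId in the POM below):

mkdir -p src/main/java/com/xueyu
# after adding the source file and the POM, the tree is:
# .
# ├── pom.xml
# └── src/main/java/com/xueyu/MyWordCount.java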

Then create the Maven build file pom.xml:


<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.xueyu</groupId>
  <artifactId>myhadoop-wordcount</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>Maven Quick Start Archetype</name>
  <url>http://maven.apache.org</url>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.2.0</version>
    </dependency>
  </dependencies>
</project>
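As an aside, to see exactly which jars hadoop-client pulls in transitively, Maven's standard dependency:tree goal lists them:

mvn dependency:tree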

Build the project with mvn package; the jar myhadoop-wordcount-1.0-SNAPSHOT.jar appears under the target directory.

Because Hadoop itself depends on many other jars that must be on the classpath at run time, run mvn clean dependency:copy-dependencies package to copy every dependency jar into target/dependency automatically.
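Run from the project root, the two build steps above look like this (the ls is just a sanity check):

mvn package
mvn clean dependency:copy-dependencies package
ls target/dependency    # should list hadoop-client-2.2.0.jar and its transitive dependencies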


Next, create a lib directory and copy both myhadoop-wordcount-1.0-SNAPSHOT.jar and the dependency jars from target/dependency into it.
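A sketch of that copy step, assuming it is run from the project root:

mkdir -p lib
cp target/myhadoop-wordcount-1.0-SNAPSHOT.jar lib/
cp target/dependency/*.jar lib/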

A simple shell loop assembles the CLASSPATH from everything in lib:

for file in ./lib/*.jar
do
  CLASSPATH=$CLASSPATH:$file
done

echo $CLASSPATH

On my machine this prints:

/opt/jdk1.7.0_17/lib:/opt/jdk1.7.0_17/jre/lib:.:./lib/activation-1.1.jar:./lib/aopalliance-1.0.jar:./lib/asm-3.1.jar:./lib/avro-1.7.4.jar:./lib/commons-beanutils-1.7.0.jar:./lib/commons-beanutils-core-1.8.0.jar:./lib/commons-cli-1.2.jar:./lib/commons-codec-1.4.jar:./lib/commons-collections-3.2.1.jar:./lib/commons-compress-1.4.1.jar:./lib/commons-configuration-1.6.jar:./lib/commons-digester-1.8.jar:./lib/commons-httpclient-3.1.jar:./lib/commons-io-2.1.jar:./lib/commons-lang-2.5.jar:./lib/commons-logging-1.1.1.jar:./lib/commons-math-2.1.jar:./lib/commons-net-3.1.jar:./lib/gmbal-api-only-3.0.0-b023.jar:./lib/grizzly-framework-2.1.2.jar:./lib/grizzly-http-2.1.2.jar:./lib/grizzly-http-server-2.1.2.jar:./lib/grizzly-http-servlet-2.1.2.jar:./lib/grizzly-rcm-2.1.2.jar:./lib/guava-11.0.2.jar:./lib/guice-3.0.jar:./lib/hadoop-annotations-2.2.0.jar:./lib/hadoop-auth-2.2.0.jar:./lib/hadoop-client-2.2.0.jar:./lib/hadoop-common-2.2.0.jar:./lib/hadoop-hdfs-2.2.0.jar:./lib/hadoop-mapreduce-client-app-2.2.0.jar:./lib/hadoop-mapreduce-client-common-2.2.0.jar:./lib/hadoop-mapreduce-client-core-2.2.0.jar:./lib/hadoop-mapreduce-client-jobclient-2.2.0.jar:./lib/hadoop-mapreduce-client-shuffle-2.2.0.jar:./lib/hadoop-yarn-api-2.2.0.jar:./lib/hadoop-yarn-client-2.2.0.jar:./lib/hadoop-yarn-common-2.2.0.jar:./lib/hadoop-yarn-server-common-2.2.0.jar:./lib/jackson-core-asl-1.8.8.jar:./lib/jackson-jaxrs-1.8.3.jar:./lib/jackson-mapper-asl-1.8.8.jar:./lib/jackson-xc-1.8.3.jar:./lib/javax.inject-1.jar:./lib/javax.servlet-3.1.jar:./lib/javax.servlet-api-3.0.1.jar:./lib/jaxb-api-2.2.2.jar:./lib/jaxb-impl-2.2.3-1.jar:./lib/jersey-client-1.9.jar:./lib/jersey-core-1.9.jar:./lib/jersey-grizzly2-1.9.jar:./lib/jersey-guice-1.9.jar:./lib/jersey-json-1.9.jar:./lib/jersey-server-1.9.jar:./lib/jersey-test-framework-core-1.9.jar:./lib/jersey-test-framework-grizzly2-1.9.jar:./lib/jettison-1.1.jar:./lib/jetty-util-6.1.26.jar:./lib/jsr305-1.3.9.jar:./lib/log4j-1.2.17.jar:./lib/management-api-3.0.0-b012.jar:./lib/myhadoop-wordcount-1.0-SNAPSHOT.jar:./lib/paranamer-2.3.jar:./lib/protobuf-java-2.5.0.jar:./lib/slf4j-api-1.7.5.jar:./lib/slf4j-log4j12-1.7.5.jar:./lib/snappy-java-1.0.4.1.jar:./lib/stax-api-1.0.1.jar:./lib/xmlenc-0.52.jar:./lib/xz-1.0.jar:./lib/zookeeper-3.4.5.jar

Then export CLASSPATH as an environment variable.
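A minimal way to do this, assuming the loop above ran in the current shell (strictly, the java -cp $CLASSPATH invocation below only needs shell expansion, but exporting makes the variable visible to child processes too):

export CLASSPATH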

Finally, run the program:

java -cp $CLASSPATH com.xueyu.MyWordCount /var/log/syslog /tmp/temp/mavenhadoop/resulta
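When the job finishes, the counts land in the reducer output file under the output directory; with the default single reducer the file is named part-r-00000:

head /tmp/temp/mavenhadoop/resulta/part-r-00000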


The analysis program is a lightly modified version of the WordCount from the Hadoop examples:

package com.xueyu;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;


public class MyWordCount {
	
	public static class TokenizerMapper
		extends Mapper<Object, Text, Text, IntWritable> {
		private final static IntWritable one = new IntWritable(1);
		private Text word = new Text();
		
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			StringTokenizer itr = new StringTokenizer(value.toString());
			while (itr.hasMoreTokens()) {
				word.set(itr.nextToken());
				context.write(word, one);
			}
		}
	}
	
	public static class IntSumReducer
		extends Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable result = new IntWritable();
		
		public void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable val : values) {
				sum += val.get();
			}
			result.set(sum);
			context.write(key, result);	
		}

	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherargs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherargs.length != 2) {
			System.err.println("Usage: MyWordCount <in> <out>");
			System.exit(2);
		}

		Job job = Job.getInstance(conf, "mywordcount");
		// use the jar that contains this class instead of a hardcoded path
		job.setJarByClass(MyWordCount.class);
		job.setMapperClass(TokenizerMapper.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(otherargs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherargs[1]));
		
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}

