The background and motivation here are simple enough that I'll save them for a later post if a suitable scenario comes up. I'm still a beginner, so for now I'll simply post the code piece by piece.
1. Main program
PartitionByStationUsingMultipleOutputs.java
package practice.hadoop.simple_examples;
import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Example of writing to multiple output files.
 * <h1>Goal</h1>
 * <p>Produce one output file of temperature records per weather station in the input.</p>
 * @author Henry
 * Created on 2017-08-18
 */
public class PartitionByStationUsingMultipleOutputs extends Configured implements Tool {

    // Mapper: re-keys each record by its station ID.
    static class StationMapper extends Mapper<LongWritable, Text, Text, Text> {
        private final NcdcRecordParser mParser = new NcdcRecordParser();

        @Override
        protected void map(final LongWritable key, final Text value, final Context context)
                throws IOException, InterruptedException {
            mParser.parse(value);
            context.write(new Text(mParser.getStationId()), value);
        }
    }

    // Reducer: writes each station's records to a file named after the station.
    static class StationReducer extends Reducer<Text, Text, NullWritable, Text> {
        private MultipleOutputs<NullWritable, Text> multipleOutputs = null;

        @Override
        protected void setup(final Context context) throws IOException, InterruptedException {
            multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
        }

        @Override
        protected void reduce(final Text key, final Iterable<Text> values, final Context context)
                throws IOException, InterruptedException {
            for (final Text value : values) {
                // The station ID serves as the baseOutputPath, i.e. the output file name prefix.
                multipleOutputs.write(NullWritable.get(), value, key.toString());
            }
        }

        @Override
        protected void cleanup(final Context context) throws IOException, InterruptedException {
            multipleOutputs.close();
        }
    }

    @Override
    public int run(final String[] args) throws Exception {
        final Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
        if (job == null) {
            return -1;
        }
        job.setMapperClass(StationMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(StationReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    // Main
    public static void main(final String[] args) throws Exception {
        final int exitCode = ToolRunner.run(new PartitionByStationUsingMultipleOutputs(), args);
        System.exit(exitCode);
    }
}
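One optional refinement, not in the original listing: since every record is written through MultipleOutputs, the job's default part-r-* files come out empty. Hadoop's LazyOutputFormat only creates the default output when something is actually written to it, so the empty files can be suppressed with the lines below (using TextOutputFormat as the underlying format is my assumption; it matches the default):

import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// In run(), after the setOutputValueClass call:
// wrap the real output format so empty part-r-* files are not created.
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);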
2. Utility classes
2.1 JobBuilder.java
Performs basic validation of the input/output arguments passed to the main program.
package practice.hadoop.simple_examples;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
public final class JobBuilder {

    public static Job parseInputAndOutput(final Tool tool, final Configuration conf, final String[] args)
            throws IOException {
        if (args.length != 2) {
            System.err.printf("Please note that you have %d parameters.%n", args.length);
            printUsage(tool, "<input> <output>");
            return null;
        }
        final Job job = Job.getInstance(conf);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job;
    }

    public static void printUsage(final Tool tool, final String extraArgsUsage) {
        System.err.printf("Usage: %s [genericOptions] %s%n%n", tool.getClass().getSimpleName(), extraArgsUsage);
    }
}
2.2 NcdcRecordParser.java
Parses one line of NCDC weather data.
package practice.hadoop.simple_examples;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.hadoop.io.Text;
public final class NcdcRecordParser {

    private static final int MISSING_TEMPERATURE = 9999;
    private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmm");

    private String stationId;
    private String observationDateString;
    private String year;
    private String airTemperatureString;
    private int airTemperature;
    private boolean airTemperatureMalformed;
    private String quality;

    public void parse(final String record) {
        stationId = record.substring(4, 10) + "-" + record.substring(10, 15);
        observationDateString = record.substring(15, 27);
        year = record.substring(15, 19);
        airTemperatureMalformed = false;
        // Strip a leading plus sign, as parseInt doesn't accept it (pre-Java 7).
        if (record.charAt(87) == '+') {
            airTemperatureString = record.substring(88, 92);
            airTemperature = Integer.parseInt(airTemperatureString);
        } else if (record.charAt(87) == '-') {
            airTemperatureString = record.substring(87, 92);
            airTemperature = Integer.parseInt(airTemperatureString);
        } else {
            airTemperatureMalformed = true;
        }
        quality = record.substring(92, 93);
    }

    public void parse(final Text record) {
        parse(record.toString());
    }

    public boolean isValidTemperature() {
        return !airTemperatureMalformed && airTemperature != MISSING_TEMPERATURE && quality.matches("[01459]");
    }

    public boolean isMalformedTemperature() {
        return airTemperatureMalformed;
    }

    public boolean isMissingTemperature() {
        return airTemperature == MISSING_TEMPERATURE;
    }

    public String getStationId() {
        return stationId;
    }

    public Date getObservationDate() {
        try {
            return DATE_FORMAT.parse(observationDateString);
        } catch (final ParseException e) {
            throw new IllegalArgumentException(e);
        }
    }

    public String getYear() {
        return year;
    }

    public int getYearInt() {
        return Integer.parseInt(year);
    }

    public int getAirTemperature() {
        return airTemperature;
    }

    public String getAirTemperatureString() {
        return airTemperatureString;
    }

    public String getQuality() {
        return quality;
    }
}
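As a quick local sanity check (not part of the original post), the parser can be exercised on the first sample record from section 6 below; the expected values follow directly from the fixed-width offsets in parse():

package practice.hadoop.simple_examples;

// Hypothetical local test class; the sample record is the first line of
// /henry/input/weather_multiple.txt shown in section 6.
public class NcdcRecordParserDemo {
    public static void main(final String[] args) {
        final NcdcRecordParser parser = new NcdcRecordParser();
        parser.parse("123457798676231190101234567986762311901012345679867623119010123456798676231190101234561+00121534567890356");
        System.out.println(parser.getStationId());       // 577986-76231 (columns 4-15)
        System.out.println(parser.getAirTemperature());  // 12 (sign at column 87, digits at 88-92)
        System.out.println(parser.isValidTemperature()); // true (quality flag '1')
    }
}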
3. pom.xml
Declares the Hadoop dependencies and registers the main class with the assembly plugin.
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>practice.hadoop</groupId>
  <artifactId>simple-examples</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>simple-examples</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <!-- <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId>
      <version>4.12</version> </dependency> -->
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.8.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.8.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.8.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.mrunit</groupId>
      <artifactId>mrunit</artifactId>
      <version>1.1.0</version>
      <classifier>hadoop2</classifier>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.8.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-common</artifactId>
      <version>2.8.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-api -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-api</artifactId>
      <version>2.8.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-auth -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-auth</artifactId>
      <version>2.8.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-jobclient -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>2.8.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-annotations -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-annotations</artifactId>
      <version>2.8.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-client -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-client</artifactId>
      <version>2.8.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>2.8.0</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <mainClass>practice.hadoop.simple_examples.PartitionByStationUsingMultipleOutputs</mainClass>
            </manifest>
          </archive>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-source-plugin</artifactId>
        <executions>
          <execution>
            <id>attach-sources</id>
            <goals>
              <goal>jar</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
4. Build
mvn clean
mvn assembly:assembly
5. Upload the jar
Upload the jar-with-dependencies file from the target directory to a cluster node.
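For example (the host is an assumption; the destination directory matches the HADOOP_CLASSPATH used in section 7):
scp target/simple-examples-0.0.1-SNAPSHOT-jar-with-dependencies.jar root@<namenode>:/software/hadoop-2.8.0/myjars/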
6. Data
/henry/input/weather_multiple.txt
123457798676231190101234567986762311901012345679867623119010123456798676231190101234561+00121534567890356
123455798676231190101234567986762311901012345679867623119010123456798676231190101234562+01122934567890456
123459798676231190201234567986762311901012345679867623119010123456798676231190101234562+02120234567893456
123456708676231190401234567986762311901012345679867623119010123456798676231190101234561+00321234567803456
123456718676231190101234567986762311902012345679867623119010123456798676231190101234561+00429234567903456
123456728676231190501234567986762311902012345679867623119010123456798676231190101234561+01021134568903456
123456738676231190201234567986762311902012345679867623119010123456798676231190101234561+01124234578903456
123456748676231190301234567986762311905012345679867623119010123456798676231190101234561+04121234678903456
123456792276231190301234567986762311905012345679867623119010123456798676231190101234561+00821234678903456
7. Run
# export HADOOP_CLASSPATH=/software/hadoop-2.8.0/myjars/simple-examples-0.0.1-SNAPSHOT-jar-with-dependencies.jar
# bin/hadoop practice.hadoop.simple_examples.PartitionByStationUsingMultipleOutputs /henry/input/weather_multiple.txt /henry/output/multipleoutputs
8. Results
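If the job succeeds, /henry/output/multipleoutputs should contain one file per station, named from the baseOutputPath passed to MultipleOutputs.write() plus the reducer suffix, e.g. 577986-76231-r-00000, alongside an empty part-r-00000 (unless LazyOutputFormat is configured as noted after the main program listing).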