本例子采用 Hadoop 1.1.2 版本,附件中有例子所用的数据文件。
采用气象数据作为处理数据
1、MultipleOutputs例子,具体解释在代码中有注释
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
package
StationPatitioner;
import
java.io.IOException;
import
java.util.Iterator;
import
org.apache.hadoop.conf.Configured;
import
org.apache.hadoop.fs.Path;
import
org.apache.hadoop.io.LongWritable;
import
org.apache.hadoop.io.NullWritable;
import
org.apache.hadoop.io.Text;
import
org.apache.hadoop.mapred.FileInputFormat;
import
org.apache.hadoop.mapred.FileOutputFormat;
import
org.apache.hadoop.mapred.JobClient;
import
org.apache.hadoop.mapred.JobConf;
import
org.apache.hadoop.mapred.MapReduceBase;
import
org.apache.hadoop.mapred.Mapper;
import
org.apache.hadoop.mapred.OutputCollector;
import
org.apache.hadoop.mapred.Reducer;
import
org.apache.hadoop.mapred.Reporter;
import
org.apache.hadoop.mapred.TextOutputFormat;
import
org.apache.hadoop.mapred.lib.MultipleOutputs;
import
org.apache.hadoop.mapred.lib.NullOutputFormat;
import
org.apache.hadoop.util.Tool;
import
org.apache.hadoop.util.ToolRunner;
/**
 * MultipleOutputs example (Hadoop 1.1.2, old {@code org.apache.hadoop.mapred} API).
 *
 * <p>The mapper keys every input record by the station id extracted by
 * {@link NcdcRecordParser}; the reducer then writes each station's records through a
 * multi named output called {@code "station"}, using the station id as the multi name part.
 *
 * @author 巧克力黑
 */
public class PatitionByStationUsingMultipleOutputs extends Configured implements Tool {

    /** Job counters. */
    enum Counter {
        /** Number of input lines skipped because they could not be parsed. */
        LINESKIP,
    }

    /** Emits (station id, original record) for every record that parses successfully. */
    static class StationMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        private final NcdcRecordParser parser = new NcdcRecordParser();

        /**
         * Parses {@code value} and emits it keyed by its station id.
         *
         * <p>A record that fails to parse increments {@link Counter#LINESKIP} and is dropped.
         *
         * @throws IOException if the collector fails to write the record
         */
        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
            try {
                parser.parse(value);
            } catch (RuntimeException e) {
                // Malformed line: count it and move on.
                reporter.getCounter(Counter.LINESKIP).increment(1);
                return;
            }
            // Kept outside the try block so that an IOException from the collector is
            // propagated instead of being miscounted as a bad input line.
            output.collect(new Text(parser.getStationid()), value);
        }
    }

    /** Writes all records of one station through the {@code "station"} multi named output. */
    static class MultipleOutputReducer extends MapReduceBase
            implements Reducer<Text, Text, NullWritable, Text> {

        private MultipleOutputs multipleOutputs;

        /** Creates the {@code MultipleOutputs} helper from the job configuration. */
        @Override
        public void configure(JobConf jobconf) {
            multipleOutputs = new MultipleOutputs(jobconf);
        }

        /**
         * Writes every value of {@code key} to the collector obtained for that station.
         *
         * <p>The regular {@code output} collector is intentionally not used.
         */
        @Override
        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<NullWritable, Text> output, Reporter reporter)
                throws IOException {
            // Dashes are stripped from the key before it is used as the multi name part
            // (presumably because MultipleOutputs only accepts letters and digits there).
            String multiName = key.toString().replace("-", "");

            // getCollector returns a raw OutputCollector in this API; the key/value types
            // match those registered with addMultiNamedOutput in run().
            @SuppressWarnings("unchecked")
            OutputCollector<NullWritable, Text> collector =
                    multipleOutputs.getCollector("station", multiName, reporter);

            while (values.hasNext()) {
                collector.collect(NullWritable.get(), values.next());
            }
        }

        /** Closes the {@code MultipleOutputs} helper. */
        @Override
        public void close() throws IOException {
            multipleOutputs.close();
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param as command-line arguments left over after generic option parsing (unused)
     * @return 0 once {@code JobClient.runJob} has returned
     * @throws Exception if job submission or execution fails
     */
    @Override
    public int run(String[] as) throws Exception {
        // Per the original author: the Windows client user differs from the Linux user,
        // and this setting is used to avoid Permission-related errors.
        System.setProperty("HADOOP_USER_NAME", "root");

        // Start from the configuration populated by ToolRunner (so generic options are
        // honoured) and let Hadoop locate the job jar from this class. Fall back to a
        // fresh JobConf when run() is invoked without a configuration having been set.
        JobConf conf = getConf() == null
                ? new JobConf(getClass())
                : new JobConf(getConf(), getClass());

        conf.setMapperClass(StationMapper.class);
        conf.setReducerClass(MultipleOutputReducer.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setOutputKeyClass(NullWritable.class);
        // The job's regular output is discarded; records are written only through the
        // "station" named output registered below.
        conf.setOutputFormat(NullOutputFormat.class);

        // Input path.
        FileInputFormat.setInputPaths(conf, new Path("hdfs://ubuntu:9000/sample1.txt"));
        // Output path.
        FileOutputFormat.setOutputPath(conf, new Path("hdfs://ubuntu:9000/temperature"));

        MultipleOutputs.addMultiNamedOutput(
                conf, "station", TextOutputFormat.class, NullWritable.class, Text.class);

        JobClient.runJob(conf);
        return 0;
    }

    /** Runs the tool through {@code ToolRunner} and exits with its return code. */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new PatitionByStationUsingMultipleOutputs(), args);
        System.exit(exitCode);
    }
}
|
2、解析气象数据的类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
package
StationPatitioner;
import
org.apache.hadoop.io.Text;
/**
 * Parses one fixed-width weather record and exposes the extracted fields.
 *
 * <p>Fields are read from fixed character offsets: station id from [0, 5), year from
 * [15, 19), signed air temperature from [87, 92) and the quality code from [92, 93).
 * An instance holds the fields of the most recently successfully parsed record.
 */
public class NcdcRecordParser {

    /** Temperature value that marks a missing reading. */
    private static final int MISSING_TEMPERATURE = 9999;

    /** Quality codes accepted by {@link #isValidTemperature()}. */
    private static final String VALID_QUALITY_CODES = "01459";

    private String year;
    private int airTemperature;
    private String quality;
    private String stationid;

    /**
     * Parses {@code record} and stores its fields in this parser.
     *
     * <p>All fields are extracted before any of them is stored, so a record that fails
     * to parse leaves the previously parsed values untouched.
     *
     * @param record the raw record line
     * @throws StringIndexOutOfBoundsException if the record is shorter than 93 characters
     * @throws NumberFormatException if the temperature field is not an integer
     */
    public void parse(String record) {
        String parsedStationId = record.substring(0, 5);
        String parsedYear = record.substring(15, 19);

        // Remove leading plus sign as parseInt doesn't like them
        int temperatureStart = record.charAt(87) == '+' ? 88 : 87;
        int parsedTemperature = Integer.parseInt(record.substring(temperatureStart, 92));
        String parsedQuality = record.substring(92, 93);

        // Commit only after every extraction succeeded.
        stationid = parsedStationId;
        year = parsedYear;
        airTemperature = parsedTemperature;
        quality = parsedQuality;
    }

    /** Returns the station id of the last parsed record, or {@code null} if none. */
    public String getStationid() {
        return stationid;
    }

    /**
     * Parses the string form of {@code record}.
     *
     * @param record the raw record line as Hadoop {@code Text}
     */
    public void parse(Text record) {
        parse(record.toString());
    }

    /**
     * Returns whether the last parsed temperature is present and has an accepted
     * quality code ({@code 0}, {@code 1}, {@code 4}, {@code 5} or {@code 9}).
     *
     * @return {@code false} if no record has been parsed yet
     */
    public boolean isValidTemperature() {
        if (quality == null || quality.length() != 1) {
            return false;
        }
        return airTemperature != MISSING_TEMPERATURE
                && VALID_QUALITY_CODES.indexOf(quality.charAt(0)) >= 0;
    }

    /** Returns the year of the last parsed record, or {@code null} if none. */
    public String getYear() {
        return year;
    }

    /** Returns the air temperature of the last parsed record, as read from the record. */
    public int getAirTemperature() {
        return airTemperature;
    }
}
|
本文转自巧克力黑 51CTO博客,原文链接:http://blog.51cto.com/10120275/1639389,如需转载请自行联系原作者