环境:
CentOS 6.5, Eclipse 4.4.2, Hadoop 1.1.2
任务目标:从数据源抽取指定的字段,并统计出错行数
test2.txt
二、程序编写
CentOS 6.5, Eclipse 4.4.2, Hadoop 1.1.2
任务目标:从数据源抽取指定的字段,并统计出错行数
一、数据源准备
在hdfs://vm1:9000/user/hadoop/in目录中上传了两个数据文件,test1.txt和test2.txt
内容如下:
test1.txt
MAY 12:10:12 192.158.202 calvin
THR 11:22:23 192.168.22.3 james
THR 22:33:22 192.155.23.22 john
FRI 23:22:12 158.129.234.23 kate
LL
DDI 23:11:33 192.168.11.10 frame
test2.txt
EIG 12:10:12 192.158.202 calvin
OCT 11:22:23 192.168.22.3 james
NUM 22:33:22 192.155.23.22 john
SEC 23:22:12 158.129.234.23 kate
二、程序编写
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * MapReduce job that extracts the first three space-separated fields of every
 * input line and counts the lines that are too short to contain them.
 *
 * <p>Usage: {@code Test_1 <input path> <output path>}
 */
public class Test_1 extends Configured implements Tool {

    /** Job counters reported by the mapper. */
    enum Counter {
        /** Number of input lines skipped because they had fewer than three fields. */
        LINE_SKIP
    }

    /**
     * Mapper that emits the first three fields of each line joined by single
     * spaces, with a {@link NullWritable} key.
     */
    public static class Map_1 extends Mapper<LongWritable, Text, NullWritable, Text> {

        /** Minimum number of space-separated fields a line must contain. */
        private static final int REQUIRED_FIELDS = 3;

        /**
         * Emits {@code "<field0> <field1> <field2>"} for a well-formed line, or
         * increments {@link Counter#LINE_SKIP} for a line with fewer than
         * three fields.
         *
         * @param key     input key supplied by the framework (unused)
         * @param value   one line of input text
         * @param context task context used to write output and update counters
         * @throws IOException          if writing the output record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] lineSplits = value.toString().split(" ");

            // Validate explicitly instead of catching every Exception: only
            // malformed lines are counted as skipped, while genuine write
            // failures propagate to the framework instead of being hidden.
            if (lineSplits.length < REQUIRED_FIELDS) {
                context.getCounter(Counter.LINE_SKIP).increment(1);
                return;
            }

            String month = lineSplits[0];
            String time = lineSplits[1];
            String ip = lineSplits[2];
            context.write(NullWritable.get(), new Text(month + " " + time + " " + ip));
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: Test_1 <input path> <output path>");
            ToolRunner.printGenericCommandUsage(System.err);
            return 2;
        }

        Configuration conf = this.getConf();
        Job job = new Job(conf, "Test_1");
        job.setJarByClass(Test_1.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(Map_1.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // The return value becomes the process exit status in main(), so it
        // follows the usual convention: 0 means success, non-zero means failure.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; exits with the status returned by {@link #run(String[])}.
     *
     * @param args command-line arguments, passed through {@link ToolRunner}
     * @throws Exception if the job cannot be run
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new Test_1(), args);
        System.exit(res);
    }
}
三、在Eclipse中运行程序
Run As -> Run Configurations... -> Arguments选项卡 -> 在Program arguments中填入:hdfs://vm1:9000/user/hadoop/in hdfs://vm1:9000/user/hadoop/out -> 点击Run
四、运行结果
在hdfs://vm1:9000/user/hadoop/out/part-r-00000文件中查看结果。test1.txt中的出错行"LL"字段不足,被跳过,因此作业计数器LINE_SKIP的值为1。抽取出的内容如下:
EIG 12:10:12 192.158.202
OCT 11:22:23 192.168.22.3
NUM 22:33:22 192.155.23.22
SEC 23:22:12 158.129.234.23
MAY 12:10:12 192.158.202
THR 11:22:23 192.168.22.3
THR 22:33:22 192.155.23.22
FRI 23:22:12 158.129.234.23
DDI 23:11:33 192.168.11.10