1.待处理的数据文件格式(部分截图):
生成上图文件的Python源码:https://blog.csdn.net/qq_25948717/article/details/82492962
2.启动hadoop,我是在node40节点上启动的伪分布式集群,在node100节点下开发的,
node40下:start-all.sh
node100下:打开Eclipse,配置好HDFS,就可以开发了:https://blog.csdn.net/qq_25948717/article/details/82347076
jar包:HADOOP_HOME/share/hadoop下的common,hdfs,mapreduce2,yarn文件夹里面的jar(不用进入子文件夹)
都添加到项目中。
本例只需要Mapper类即可。
注意文件数据要上传到hdfs系统上,程序从hdfs上读取,再输出到hdfs。
3.源代码:以序列化格式输出,体现在org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
在这里设置文件路径(程序参数 args[0] 为输入路径,args[1] 为输出路径):
package hadoop.yexin.mapreduce;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Map-only MapReduce job that filters comma-separated text records by date.
 *
 * <p>Each input line is split on {@code ','}. A record is kept when its second
 * field contains {@code "2018-02"} or {@code "2018-12"}; kept records are
 * emitted as (first field, second field) pairs and written through
 * {@link SequenceFileOutputFormat}.
 *
 * <p>Usage: {@code SelectDataBySpecialDate <input path> <output path>}
 */
public class SelectDataBySpecialDate {
    /**
     * HDFS URI string. Not referenced by the job setup in this class: input and
     * output locations come from the command-line arguments.
     */
    public static String HDFS_PATH = "hdfs://node40:9000";

    /**
     * Mapper that forwards only the records whose date field matches one of the
     * selected months. No reducer is configured, so the mapper output is the
     * job output.
     */
    public static class SelectDatamapper extends Mapper<LongWritable, Text, Text, Text> {
        /**
         * Emits (field 0, field 1) for every line whose second comma-separated
         * field contains {@code "2018-02"} or {@code "2018-12"}.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of input text
         * @param context sink for the selected records
         * @throws IOException          if writing the record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            if (fields.length < 2) {
                // Blank or malformed line without a second column: skip it
                // instead of failing the whole task on an index error.
                return;
            }
            String date = fields[1];
            if (date.contains("2018-02") || date.contains("2018-12")) {
                context.write(new Text(fields[0]), new Text(date));
            }
        }

        /**
         * Backward-compatible entry point; the driver used to be declared here.
         *
         * @param args see {@link SelectDataBySpecialDate#main(String[])}
         * @deprecated use {@link SelectDataBySpecialDate#main(String[])}
         */
        @Deprecated
        public static void main(String[] args)
                throws IOException, InterruptedException, URISyntaxException {
            SelectDataBySpecialDate.main(args);
        }
    }

    /**
     * Configures and runs the job, then exits with status 0 on success and 1 on
     * failure (2 when the arguments are missing).
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IOException           if the file system or job submission fails
     * @throws InterruptedException  if waiting for the job is interrupted
     * @throws URISyntaxException    declared for compatibility with the original signature
     * @throws IllegalStateException if the job classes cannot be loaded
     */
    public static void main(String[] args)
            throws IOException, InterruptedException, URISyntaxException {
        if (args.length < 2) {
            System.err.println("Usage: SelectDataBySpecialDate <input path> <output path>");
            System.exit(2);
            return;
        }
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "SelectData");
        job.setJarByClass(SelectDataBySpecialDate.class);
        job.setMapperClass(SelectDatamapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        // Map-only job: mapper output goes straight to the output format.
        job.setNumReduceTasks(0);

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Clear a stale OUTPUT directory (never the input): the job refuses to
        // start when the output path already exists. The file system is resolved
        // from the path itself so a fully qualified hdfs:// URI works even when
        // the default file system is something else. The instance is not closed
        // here on purpose; the job still uses the file system after this point.
        FileSystem outputFs = outputPath.getFileSystem(conf);
        if (outputFs.exists(outputPath)) {
            outputFs.delete(outputPath, true);
        }

        boolean succeeded;
        try {
            succeeded = job.waitForCompletion(true);
        } catch (ClassNotFoundException e) {
            throw new IllegalStateException("Job classes could not be loaded", e);
        }
        System.exit(succeeded ? 0 : 1);
    }
}