Hadoop实操3-筛选出社交网站特定日期的用户，以序列化格式输出

最新推荐文章于 2024-04-25 11:48:16 发布

大鱼-瓶邪

最新推荐文章于 2024-04-25 11:48:16 发布

阅读量381

点赞数

分类专栏： Java Hadoop

本文链接：https://blog.csdn.net/qq_25948717/article/details/82498053

版权

Hadoop 同时被 2 个专栏收录

58 篇文章 7 订阅

订阅专栏

Java

32 篇文章 0 订阅

订阅专栏

1.待处理的数据文件格式（部分截图）：

生成上图文件的Python源码：https://blog.csdn.net/qq_25948717/article/details/82492962

2.启动hadoop，我是在node40节点上启动的伪分布式集群，在node100节点下开发的,

node40下：start-all.sh

node100下：打开Eclipse 配置好HDFS，就可以开发了：https://blog.csdn.net/qq_25948717/article/details/82347076

jar包：HADOOP_HOME/share/hadoop下的common，hdfs，mapreduce2，yarn文件夹里面的jar（不用进入子文件夹）

都添加到项目中。

本例只需要Mapper类即可。

注意文件数据要上传到hdfs系统上，程序从hdfs上读取，再输出到hdfs。

3.源代码：以序列化格式输出，体现在org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);

在这里设置文件路径：

package hadoop.yexin.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

public class SelectDataBySpecialDate {

   public static String HDFS_PATH = "hdfs://node40:9000";
   //static FileSystem fileSystem = null;
   //static Configuration conf = null;

   public static class SelectDatamapper extends Mapper<LongWritable,Text,Text,Text>{
       /**
        * Filters one comma-separated input line by its second field.
        *
        * <p>The line is split on {@code ","}. When the second field contains
        * {@code "2018-02"} or {@code "2018-12"}, the first field is written as the
        * output key and the second field as the output value. Lines with fewer
        * than two fields (for example blank lines) are skipped and counted in the
        * {@code SelectData / MALFORMED_RECORDS} counter instead of failing the task.
        *
        * @param key     input key supplied by the input format; not used here
        * @param value   one line of input text
        * @param context task context used to emit output pairs and update counters
        * @throws IOException          if writing the output pair fails
        * @throws InterruptedException if the task is interrupted while writing
        */
       @Override
       protected void map(LongWritable key,Text value,Mapper<LongWritable, Text, Text, Text>.Context context)
               throws IOException,InterruptedException{
           String[] val = value.toString().split(",");
           // A record without a second field cannot be date-filtered; skip it
           // rather than let an ArrayIndexOutOfBoundsException kill the task.
           if(val.length < 2){
               context.getCounter("SelectData", "MALFORMED_RECORDS").increment(1);
               return;
           }
           //filter the date
           String date = val[1];
           if(date.contains("2018-02") || date.contains("2018-12")){
               context.write(new Text(val[0]), new Text(date));
           }
       }

       /**
        * Configures and runs the map-only "SelectData" job.
        *
        * <p>Input is read as text from {@code args[0]}; output is written as a
        * sequence file to {@code args[1]}. If the output path already exists it
        * is deleted recursively before the job is submitted, so that a re-run
        * does not fail on an existing output directory. The input path is never
        * modified. The process exits with status 0 when the job succeeds, 1 when
        * it fails, and 2 when the arguments are missing.
        *
        * @param args {@code args[0]} is the input path, {@code args[1]} the output path
        * @throws IOException          if the file system or job submission fails, or
        *                              if the job classes cannot be loaded
        * @throws InterruptedException if waiting for job completion is interrupted
        * @throws URISyntaxException   declared for compatibility with existing callers
        */
       public static void main(String[] args) throws IOException,InterruptedException, URISyntaxException {
           if (args.length < 2) {
               System.err.println("Usage: SelectDataBySpecialDate <input path> <output path>");
               System.exit(2);
           }

           Configuration conf = new Configuration();
           Path inputPath = new Path(args[0]);
           Path outputPath = new Path(args[1]);

           Job job = Job.getInstance(conf,"SelectData");
           job.setJarByClass(SelectDataBySpecialDate.class);
           job.setMapperClass(SelectDatamapper.class);
           job.setOutputKeyClass(Text.class);
           job.setOutputValueClass(Text.class);

           job.setInputFormatClass(TextInputFormat.class);
           job.setOutputFormatClass(SequenceFileOutputFormat.class);

           // Map-only job: mapper output is written directly by the output format.
           job.setNumReduceTasks(0);

           FileInputFormat.addInputPath(job, inputPath);

           // Clear a stale OUTPUT directory (never the input). The file system is
           // resolved from the path itself so an explicit hdfs:// URI is honoured
           // even when the default file system in the configuration differs.
           FileSystem outputFs = outputPath.getFileSystem(conf);
           if (outputFs.exists(outputPath)) {
               outputFs.delete(outputPath, true);
           }
           FileOutputFormat.setOutputPath(job, outputPath);

           boolean succeeded;
           try {
               succeeded = job.waitForCompletion(true);
           } catch (ClassNotFoundException e) {
               // Surface the failure to the caller with its cause instead of
               // printing a stack trace and returning as if nothing went wrong.
               throw new IOException("Job classes could not be loaded", e);
           }
           System.exit(succeeded ? 0 : 1);
       }
   }