Hadoop Project in Practice: Multiple Input Types


1. Background

A problem that comes up in day-to-day work: what if a MapReduce job needs data that lives in different source files? For example, one batch of records comes from the text file users.txt, and another batch comes from transactions.txt (which could just as well be a non-text source such as a database). Each batch needs to feed its own mapper. How do we implement that?
Hadoop is a mature project by now, and happily it has learned to merge mapper output from different sources on its own. It provides a class named MultipleInputs for exactly this kind of multi-type input; see my blog post "MultipleInputs详解" for the details.
Below is an example that demonstrates how to use MultipleInputs.
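As a quick preview, MultipleInputs (in org.apache.hadoop.mapreduce.lib.input) exposes two overloads of addInputPath: one that only binds an input format to a path, and one that additionally binds a dedicated Mapper class to that path. A minimal sketch (MyMapper is just a placeholder name, not part of this project):

// Per-path input format only; the job-wide mapper set with job.setMapperClass() handles this path.
MultipleInputs.addInputPath(job, new Path("/input/users.txt"), TextInputFormat.class);
// Per-path input format plus a per-path mapper (MyMapper is a placeholder).
MultipleInputs.addInputPath(job, new Path("/input/users.txt"), TextInputFormat.class, MyMapper.class);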

2. Example
2.1 Requirements

There are two text files, users.txt and transactions.txt. Both are plain text, but their record formats differ.

  • users.txt stores records in the format <user_id, location_id>
  • transactions.txt stores records in the format <timestamp product user_id price number>
    We first need to extract, for each user_id, the corresponding location_id and product_ids. For example, given the following data:
[root@server4 hadoop]# hdfs dfs -cat /input/users.txt
u1 UT
u1 GA
u3 CA
u3 CA
u5 GA
[root@server4 hadoop]# hdfs dfs -cat /input/transactions.txt
t1 p3 u1 3 330
t2 p1 u2 1 400
t3 p1 u1 3 600
t4 p2 u2 10 1000
t5 p4 u4 9 90
t6 p1 u1 4 120
t7 p4 u1 8 160
t8 p4 u5 2 40

The result we want is:

[root@server4 hadoop]# hdfs dfs -cat /output/leftJoin/part-r-00000
u1	UT,p4,p1,p1,p3,
u2	GA,p2,p1,
u3	CA,
u4	CA,p4,
u5	GA,p4,
  • If the two files had the same type and format, MultipleInputs would not be needed at all: FileInputFormat.addInputPath() plus Job.setMapperClass() would be enough (see the sketch after this list). But when the file types or formats are not identical, the MultipleInputs class is required.
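For comparison, here is a minimal sketch (reusing the class names from this article) of what the driver's run() method would contain in the single-format case, in place of the two MultipleInputs.addInputPath() calls shown in section 3.1:

// Single input format and a single mapper: MultipleInputs is unnecessary.
// (Requires org.apache.hadoop.mapreduce.lib.input.FileInputFormat.)
FileInputFormat.addInputPath(job, new Path(args[0]));
FileInputFormat.addInputPath(job, new Path(args[1]));
job.setInputFormatClass(TextInputFormat.class);
job.setMapperClass(UserMapper.class);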
3. Code
3.1 LeftJoinDriver
package data_algorithm.chapter_4;

import data_algorithm.utils.HdfsUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/**
 * Goal of this MapReduce job: for every user_id, put the user information and the
 * products that user bought together, emitting the user information (location) first
 * and the product information afterwards.
 * For example, given the records:
 * user_id ,location
 * 1       ,shanghai
 *
 * user_id ,product_id
 * 1       ,disk
 * 1       ,mouse
 * the expected result is:
 * user_id, location, list[product_id]
 * 1      , shanghai, [disk,mouse]
 *
 */
public class LeftJoinDriver extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            System.err.println("Usage: LeftJoinDriver <users path> <transactions path> <output path>");
            System.exit(1);
        }
        HdfsUtils.deletePath(args[2]);
        int returnStatus = ToolRunner.run(new LeftJoinDriver(), args);
        System.exit(returnStatus);
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();   // use the configuration injected by ToolRunner
        Job job = Job.getInstance(conf);
        job.setJarByClass(LeftJoinDriver.class);

        Path users = new Path(args[0]);
        Path transactions = new Path(args[1]);

        // MultipleInputs lets one job use several Mapper classes; with a single Mapper class, Job.setMapperClass() is enough
        MultipleInputs.addInputPath(job,
                users,
                TextInputFormat.class,
                UserMapper.class);

        MultipleInputs.addInputPath(job,
                transactions,
                TextInputFormat.class,
                TransactionMapper.class);

        //set output file path
        FileOutputFormat.setOutputPath(job,new Path(args[2]));

        job.setMapOutputKeyClass(User.class);
        job.setMapOutputValueClass(Text.class);
        job.setPartitionerClass(UserPartitioner.class);

        job.setGroupingComparatorClass(SecondarySortGroupComparator.class);
        job.setReducerClass(UserReducer.class);
        //job.setNumReduceTasks(3);
        boolean status = job.waitForCompletion(true);
        return status? 0: 1;
    }
}
3.2 UserMapper
package data_algorithm.chapter_4;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper for the user records (users.txt).
 */
public class UserMapper extends Mapper<LongWritable,Text,User,Text>{

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line [] = value.toString().split(" ");
        String user_id = line[0];

        String user_loc = line[1];
        // build a User key whose product_name field is a placeholder ("empty")
        User user = new User(user_id,1,user_loc,"empty");

        // emit <User, location>
        context.write(user,new Text(user_loc));
        //System.out.println("UserMapper End...");
    }
}

3.3 TransactionMapper
package data_algorithm.chapter_4;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper for the transaction records (transactions.txt).
 */
public class TransactionMapper extends Mapper<LongWritable,Text,User,Text>{
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line [] = value.toString().split(" ");
        String user_prd = line[1];
        String user_id = line[2];

        // build a User key whose location field is a placeholder ("empty")
        User user = new User(user_id,2,"empty",user_prd);
        // emit <User, product>
        context.write(user, new Text(user_prd));
    }
}

3.4 UserPartitioner
package data_algorithm.chapter_4;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class UserPartitioner extends Partitioner<User,Text> {
        // numPartitions is the number of reduce tasks; the framework passes it in (see the note after this class)
        public int getPartition(User user, Text text, int numPartitions) {
            return Math.abs(user.getUser_id().hashCode() % numPartitions);
        }
}
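To answer the question in the comment above: numPartitions is supplied by the framework and equals the number of reduce tasks configured for the job (the default is 1). A one-line sketch of where the value comes from, in the driver:

// The value passed into getPartition(..., numPartitions) equals this setting; 3 is only an illustration.
job.setNumReduceTasks(3);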
3.5 SecondarySortGroupComparator
package data_algorithm.chapter_4;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class SecondarySortGroupComparator extends WritableComparator {

    // This constructor must be provided, otherwise the job fails (see section 6.2)
    public SecondarySortGroupComparator() {
        super(User.class,true);
    }   
    @Override
    public int compare(WritableComparable wc1, WritableComparable wc2) {
        User user1 = (User)wc1;
        User user2 = (User)wc2;

        return user1.getUser_id().compareTo(user2.getUser_id());
    }
}

3.6 UserReducer
package data_algorithm.chapter_4;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class UserReducer extends Reducer<User,Text,Text,Text> {
    @Override
    protected void reduce(User key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();
        for (Text tx : values) {
            sb.append(tx.toString()).append(",");
        }
        System.out.println("key:"+key+",value: "+sb.toString());
        context.write(new Text(key.getUser_id()),new Text(sb.toString()));
    }
}

3.7 User
package data_algorithm.chapter_4;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class User implements Writable, WritableComparable<User>{
    private String user_id;
    private int level;//the level of 1 or 2
    private String location;//user location
    private String product_name;//the product name of user buy

    // A no-arg constructor is required: Hadoop instantiates Writable objects reflectively before calling readFields() (see section 6.1)
    public User() {
    }

    public User(String user_id, int level, String location, String product_name) {
        this.user_id = user_id;
        this.level = level;
        this.location = location;
        this.product_name = product_name;
    }

    public String getUser_id() {
        return user_id;
    }

    public void setUser_id(String user_id) {
        this.user_id = user_id;
    }

    public String getLocation() {
        return location;
    }

    public void setLocation(String location) {
        this.location = location;
    }

    public String getProduct_name() {
        return product_name;
    }

    public void setProduct_name(String product_name) {
        this.product_name = product_name;
    }

    public int getLevel() {
        return level;
    }

    public void setLevel(int level) {
        this.level = level;
    }

    public int compareTo(User user) {
        int compareValue = this.getUser_id().compareTo(user.getUser_id());
        if (compareValue == 0) {
            return this.getLevel() - user.getLevel();  // location (level 1) first, then products (level 2)
            //return user.getLevel() - this.getLevel(); // products first, then location
        }
        return compareValue;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.user_id);
        out.writeInt(this.level);
        out.writeUTF(this.location);
        out.writeUTF(this.product_name);
    }

    public void readFields(DataInput in) throws IOException {
        this.user_id = in.readUTF();
        this.level = in.readInt();
        this.location = in.readUTF();
        this.product_name = in.readUTF();
    }

    @Override
    public int hashCode() {
        int result = this.getUser_id() != null ? this.getUser_id().hashCode() : 0;
        //result = 31 * result + level;
        // Do not mix level into the hash; otherwise keys with the same user_id could end up in different partitions.
        // Debug shortcut: pin u1 and u2 to fixed hash values.
        if (this.getUser_id().equals("u1")) {
            System.out.println("u1");
            return 1;
        }
        if (this.getUser_id().equals("u2")) {
            System.out.println("u2");
            return 2;
        }
        return result;
    }

    @Override
    public String toString() {
        return this.getUser_id();
    }
}
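Because compareTo() orders keys first by user_id and then by level, the level-1 key (the location) of a user always sorts ahead of that user's level-2 keys (the products). The following stand-alone sketch (UserSortDemo is a name introduced only for illustration, not part of the job) reuses the User class above with java.util sorting to make that ordering visible:

package data_algorithm.chapter_4;

import java.util.Arrays;
import java.util.List;

public class UserSortDemo {
    public static void main(String[] args) {
        // One user record (level 1) and two transaction records (level 2) for u1.
        List<User> keys = Arrays.asList(
                new User("u1", 2, "empty", "p3"),
                new User("u1", 1, "UT", "empty"),
                new User("u1", 2, "empty", "p1"));
        keys.sort(User::compareTo);
        // Prints the level-1 key (location) first, then the level-2 keys (products).
        for (User u : keys) {
            System.out.println(u.getUser_id() + " level=" + u.getLevel());
        }
    }
}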
4. Run Results
[root@server4 hadoop]# hdfs dfs -cat /output/leftJoin/part-r-00000
u1	UT,p4,p1,p1,p3,
u2	GA,p2,p1,
u3	CA,
u4	CA,p4,
u5	GA,p4,
5. Notes

A few points to note:

  • A custom key or value class must implement the Writable interface, Hadoop's core serialization contract; the User class above does exactly that.
  • If a custom key needs sorting, it must also implement WritableComparable, whose compareTo() method drives the sort (a minimal skeleton follows this list).
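A minimal skeleton of that contract (MyKey and its id field are placeholder names; the User class in section 3.7 is the full, real example):

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class MyKey implements WritableComparable<MyKey> {
    private String id;                 // placeholder field

    public MyKey() { }                 // no-arg constructor, needed for reflective deserialization

    @Override
    public void write(DataOutput out) throws IOException { out.writeUTF(id); }

    @Override
    public void readFields(DataInput in) throws IOException { id = in.readUTF(); }

    @Override
    public int compareTo(MyKey other) { return id.compareTo(other.id); }
}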
6. Common Errors

The following problems came up while developing this project:

6.1 hadoop java.lang.RuntimeException: java.lang.NoSuchMethodException
  • Error
hadoop java.lang.RuntimeException: java.lang.NoSuchMethodException
  • Cause
    This error occurs because the class implementing Writable lacks a no-arg constructor. Every class used as a Writable key or value must provide a public no-arg constructor; see the snippet below.
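A sketch of the fix, taken from the User class in section 3.7: keep an explicit public no-arg constructor alongside the parameterized one.

public class User implements Writable, WritableComparable<User> {

    // Required: Hadoop instantiates key/value objects reflectively and then calls readFields().
    public User() {
    }

    // ... the parameterized constructor and the rest of the class as shown in section 3.7
}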
6.2 java.lang.RuntimeException: java.io.EOFException
  • Error
java.lang.RuntimeException: java.io.EOFException
	at org.apache.hadoop.io.WritableComparator.compare(WritableComparator.java:164) ~[hadoop-common-2.6.4.jar:na]
	at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.compare(MapTask.java:1265) ~[hadoop-mapreduce-client-core-2.6.4.jar:na]
	at org.apache.hadoop.util.QuickSort.sortInternal(QuickSort.java:74) ~[hadoop-common-2.6.4.jar:na]
	at org.apache.hadoop.util.QuickSort.sort(QuickSort.java:63) ~[hadoop-common-2.6.4.jar:na]
	at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1593) ~[hadoop-mapreduce-client-core-2.6.4.jar:na]
	at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:1482) ~[hadoop-mapreduce-client-core-2.6.4.jar:na]
	at org.apache.hadoop.mapred.MapTask$NewOutputCollector.close(MapTask.java:720) ~[hadoop-mapreduce-client-core-2.6.4.jar:na]
	at org.apache.hadoop.mapred.MapTask.closeQuietly(MapTask.java:2012) [hadoop-mapreduce-client-core-2.6.4.jar:na]
	at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:794) [hadoop-mapreduce-client-core-2.6.4.jar:na]
	at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341) [hadoop-mapreduce-client-core-2.6.4.jar:na]
	at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:243) [hadoop-mapreduce-client-common-2.6.5.jar:na]
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [na:1.8.0_77]
	at java.util.concurrent.FutureTask.run(FutureTask.java:266) [na:1.8.0_77]
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [na:1.8.0_77]
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [na:1.8.0_77]
	at java.lang.Thread.run(Thread.java:745) [na:1.8.0_77]
Caused by: java.io.EOFException: null
	at java.io.DataInputStream.readFully(DataInputStream.java:197) ~[na:1.8.0_77]
	at java.io.DataInputStream.readUTF(DataInputStream.java:609) ~[na:1.8.0_77]
	at java.io.DataInputStream.readUTF(DataInputStream.java:564) ~[na:1.8.0_77]
	at data_algorithm.chapter_4.User.readFields(User.java:79) ~[classes/:na]
	at org.apache.hadoop.io.WritableComparator.compare(WritableComparator.java:158) ~[hadoop-common-2.6.4.jar:na]
	... 15 common frames omitted
  • Cause
    The class extending WritableComparator was missing the constructor shown below, which registers User as the key class; without it the comparator cannot do its job and the run fails.
public class SecondarySortGroupComparator extends WritableComparator {
    
    // This constructor must be provided, otherwise the error above is thrown
    public SecondarySortGroupComparator() {
        super(User.class,true);
    }
	····
}
7. How the Grouping Comparator Shapes the Result

Getting the final result was not entirely smooth. If the compare() method in the SecondarySortGroupComparator class is changed to the following:

    @Override
    public int compare(WritableComparable wc1, WritableComparable wc2) {        
        return 1;
    }

then the output is:

[root@server4 hadoop]# hdfs dfs -cat hdfs://server4:9000/output/leftJoin/part-r-00000
u1	UT,
u2	GA,
u3	CA,
u4	CA,
u5	GA,
u2	p2,
u2	p1,
u4	p4,
u1	p4,
u1	p1,
u1	p1,
u1	p3,
u5	p4,

This is because every key is treated as its own group, so even records with the same user_id are not reduced together. Likewise, if the method is changed to:

    @Override
    public int compare(WritableComparable wc1, WritableComparable wc2) {        
        return 0;
    }

the output becomes:

[root@server4 mapreduce]# hdfs dfs -cat  /output/leftJoin/part-r-00000
u5	UT,GA,CA,CA,GA,p2,p1,p4,p4,p1,p1,p3,p4,
[root@server4 mapreduce]# 

As you can see, all user_ids are now placed in a single group, so only one user_id is emitted, followed by a long list of values.
The correct form of the compare() method is:

public int compare(WritableComparable wc1, WritableComparable wc2) {
        User user1 = (User)wc1;
        User user2 = (User)wc2;
        return user1.getUser_id().compareTo(user2.getUser_id());
    }

This means user_id is the comparison key: records with equal user_id values fall into the same group, while records with different values do not. Records in the same group reach the reducer as a single <key, list<value>> call.
