hadoop05--(mapreduce增强)

最新推荐文章于 2022-06-08 15:33:50 发布

时空恋旅人

最新推荐文章于 2022-06-08 15:33:50 发布

阅读量222

点赞数

分类专栏：离线计算

本文链接：https://blog.csdn.net/weixin_37650458/article/details/85642599

版权

离线计算专栏收录该内容

6 篇文章 0 订阅

订阅专栏

1.mapreduce的join运算

需求：

订单数据表t_order：

id	date	pid	amount
1001	20150710	P0001	2
1002	20150710	P0001	3
1002	20150710	P0002	3

商品信息表t_product

id	pname	category_id	price
P0001	小米5	1000	2
P0002	锤子T1	1000	3

假如数据量巨大，两表的数据是以文件的形式存储在HDFS中，需要用mapreduce程序来实现一下SQL查询运算：

select  a.id,a.date,b.name,b.category_id,b.price from t_order a join t_product b on a.pid = b.id

Id date pname category_id price

1001 20150710 小米5 1000 2

1002 20150710 小米5 1000 2

1002 20150710 锤子T1 1000 3

2.实现方法：

采用面向对象的思想首先将文件封装成对象，这个对象的属性就是需要查询的字段，对象还需要一个属性来表明是哪一个文件，在map中根据文件名字区分不同的文件提取想要的字段，然后再在reduce端进行组合。

懒得启动虚拟机，本例子就在本地运行

首先写一个驱动类：

package com.wx.mapreduce3;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


/**
 * 订单表和商品表合到一起
 order.txt(订单id, 日期, 商品编号, 数量)
 1001	20150710	P0001	2
 1002	20150710	P0001	3
 1002	20150710	P0002	3
 1003	20150710	P0003	3
 product.txt(商品编号, 商品名字, 价格, 数量)
 P0001	小米5	1001	2
 P0002	锤子T1	1000	3
 P0003	锤子	1002	4

 */
public class RJoin {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //本地运行
        conf.set("mapreduce.framework.name", "local");
        Job job = Job.getInstance(conf);

        // 指定本程序的jar包所在的本地路径
        job.setJarByClass(RJoin.class);
		job.setJar("f:\\learnhadoop-1.0-SNAPSHOT.jar");
        // 指定本业务job要使用的mapper/Reducer业务类
        job.setMapperClass(RJoinMapper.class);
        job.setReducerClass(RJoinReducer.class);

        // 指定mapper输出数据的kv类型
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfoBean.class);

        // 指定最终输出的数据的kv类型
        job.setOutputKeyClass(InfoBean.class);
        job.setOutputValueClass(NullWritable.class);

        // 指定job的输入原始文件所在目录
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // 指定job的输出结果所在目录
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 将job中配置的相关参数，以及job所用的java类所在的jar包，提交给yarn去运行
        /* job.submit(); */
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);

    }
}

bean:

package com.wx.mapreduce3;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class InfoBean implements Writable {

    private int order_id;
    private String dateString;
    private String p_id;
    private int amount;
    private String pname;
    private int category_id;
    private float price;

    // flag=0表示这个对象是封装订单表记录
    // flag=1表示这个对象是封装产品信息记录
    private String flag;

    public InfoBean() {
    }

    public void set(int order_id, String dateString, String p_id, int amount, String pname, int category_id, float price, String flag) {
        this.order_id = order_id;
        this.dateString = dateString;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.category_id = category_id;
        this.price = price;
        this.flag = flag;
    }

    public int getOrder_id() {
        return order_id;
    }

    public void setOrder_id(int order_id) {
        this.order_id = order_id;
    }

    public String getDateString() {
        return dateString;
    }

    public void setDateString(String dateString) {
        this.dateString = dateString;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public int getCategory_id() {
        return category_id;
    }

    public void setCategory_id(int category_id) {
        this.category_id = category_id;
    }

    public float getPrice() {
        return price;
    }

    public void setPrice(float price) {
        this.price = price;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    /**
     * private int order_id; private String dateString; private int p_id;
     * private int amount; private String pname; private int category_id;
     * private float price;
     */

    public void write(DataOutput out) throws IOException {
        out.writeInt(order_id);
        out.writeUTF(dateString);
        out.writeUTF(p_id);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeInt(category_id);
        out.writeFloat(price);
        out.writeUTF(flag);

    }


    public void readFields(DataInput in) throws IOException {
        this.order_id = in.readInt();
        this.dateString = in.readUTF();
        this.p_id = in.readUTF();
        this.amount = in.readInt();
        this.pname = in.readUTF();
        this.category_id = in.readInt();
        this.price = in.readFloat();
        this.flag = in.readUTF();

    }

    @Override
    public String toString() {
        return "order_id=" + order_id + ", dateString=" + dateString + ", p_id=" + p_id + ", amount=" + amount + ", pname=" + pname + ", category_id=" + category_id + ", price=" + price + ", flag=" + flag;
    }


}

mapper:

package com.wx.mapreduce3;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;
import java.io.UnsupportedEncodingException;

public class RJoinMapper  extends Mapper<LongWritable, Text, Text, InfoBean> {
    InfoBean bean = new InfoBean();
    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        Text newvalue = transformTextToUTF8(value, "GBK");
        String line = newvalue.toString();
        System.out.println(line);
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        System.out.println(inputSplit);
        String name = inputSplit.getPath().getName();
        /***通过文件名判断是哪种数据*/
        String pid = "";
        if (name.startsWith("order")) {
            String[] fields = line.split(",");
            // id date pid amount
            pid = fields[2];
            bean.set(Integer.parseInt(fields[0]), fields[1], pid, Integer.parseInt(fields[3]), "", 0, 0, "0");

        } else {
            String[] fields = line.split(",");
            // id pname category_id price
            pid = fields[0];
            bean.set(0, "", pid, 0, fields[1], Integer.parseInt(fields[2]), Float.parseFloat(fields[3]), "1");

        }
        k.set(pid);
        context.write(k, bean);
    }
    public static Text transformTextToUTF8(Text text, String encoding) {
        String value = null;
        try {
            value = new String(text.getBytes(), 0, text.getLength(), encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return new Text(value);
    }
}

reduce:

package com.wx.mapreduce3;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;

public class RJoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {

    @Override
    protected void reduce(Text pid, Iterable<InfoBean> beans, Context context) throws IOException, InterruptedException {
        InfoBean pdBean = new InfoBean();
        ArrayList<InfoBean> orderBeans = new ArrayList<InfoBean>();

        for (InfoBean bean : beans) {
            if ("1".equals(bean.getFlag())) {	//产品的
                try {
                    BeanUtils.copyProperties(pdBean, bean);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            } else {
                InfoBean odbean = new InfoBean();
                try {
                    BeanUtils.copyProperties(odbean, bean);
                    orderBeans.add(odbean);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }

        }

        // 拼接两类数据形成最终结果
        for (InfoBean bean : orderBeans) {

            bean.setPname(pdBean.getPname());
            bean.setCategory_id(pdBean.getCategory_id());
            bean.setPrice(pdBean.getPrice());

            context.write(bean, NullWritable.get());
        }
    }
}

配置启动参数：

maven 打包：mvn clean package

准备数据：

运行：

刚刚开始出现中文乱码：

Hadoop处理GBK文本时,发现输出出现了乱码,原来HADOOP在涉及编码时都是写死的UTF-8，如果文件编码格式是其它类型（如GBK)，则会出现乱码。

此时只需在mapper或reducer程序中读取Text时，使用transformTextToUTF8(text, "GBK");进行一下转码，以确保都是以UTF-8的编码方式在运行。

想要在本地运行必须将windos下编译好的hadoop配置到环境变量中，具体配置：

https://www.cnblogs.com/wxplmm/p/7253934.html

然后启动参数配置好，启动后报错

Could not locate executable null\bin\winutils.exe in the Hadoop binaries.

可能需要重启系统，重启后可以

2.数据倾斜问题：

Map端join算法，就可以解决数据全局问题，但是产品表不能太大，产品表就放在mapreduce 提供的缓存机制中

package com.wx.mapreduce4;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Mapjoin {

    /**
     * 利用mapreduce的缓存机制在map端进行多表的join运算
     */

    static class MapJoinmap extends Mapper<LongWritable, Text, Text, NullWritable>
    {
        //用一个hashmap来保存加载产品的信息
        Map<String, String> pdinfo=new HashMap<String, String>();
        //通过阅读mapper源码发现，setup方法是maptask处理数据之前调用一次，可以用来做一些初始化 的工作
        @Override
        protected void setup( Context context) throws IOException, InterruptedException {
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("pds.txt")));
            String line;
            while(StringUtils.isNotEmpty(line=br.readLine()))
            {
                String[] split = line.split(",");
                pdinfo.put(split[0], split[1]);
            }
            br.close();

        }
        Text k=new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            String[] split = value.toString().split(",");
            String name = pdinfo.get(split[1]);
            k.set(value.toString()+"\t"+name);
            context.write(k, NullWritable.get());
        }
    }
    public static void main(String[] args) throws Exception {
        // TODO Auto-generated method stub
        Configuration conf=new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Mapjoin.class);
        //设置执行map的类以及输出的数据的类型
        job.setMapperClass(MapJoinmap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        //设置读取处理文件的路径
        FileInputFormat.setInputPaths(job, new Path("F:/wordcount/mapjoininput"));
        FileOutputFormat.setOutputPath(job, new Path("F:/wordcount/mapjoinoutput"));
        //指定一个文件需要缓存到maptask的工作目录
        //job.addArchiveToClassPath(archive);//缓存jar包到maptask运行节点的classpath中
        //job.addFileToClassPath(file);//缓存普通文件到maptask的运行节点的calsspath中
        //job.addCacheArchive(uri);//缓存压缩包文件到calsspath中
        job.addCacheFile(new URI("file:/F:/wordcount/mapjoincache/pds.txt"));//将产品表的产品文件缓存到task的工作目录中去

        //设置不需要reduce
        job.setNumReduceTasks(0);

        boolean re = job.waitForCompletion(true);
        System.exit(re?0:1);
    }

}

运行程序，系统报错：

java.lang.Exception: java.io.FileNotFoundException: pds.txt (系统找不到指定的文件。)

仔细找到错误：

failed 1 with: CreateSymbolicLink error (1314): ???????????

Failed to create symlink: \tmp\hadoop-cxdn\mapred\local\1537718956274\pds.txt <- F:\JAVAEE_jdk1.7\workspacemars2\workspacebigdata2\mapperreduce/pds.txt

网上解决办法：

https://blog.csdn.net/wangzy0327/article/details/78786410

1. Hadoop 毕竟是LINUX的东西，所以安装目录（包括JDK）都别用太特别的字符或者空格； 2. CreateSymbolicLink error (1314): A required privilege is not held by the client. Windows 的执行权限问题，望这方面找就对了。网上找到的具体解决办法：

1.打开本地策略(按win+R 打开 cmd，输入 gpedit.msc（win10家庭版没有这个，必须要其专业或者企业版才有）)

2.打开本地计算机策略 - window设置 - 安全设置 - 本地策略 - 用户权限配置

3.找到创建符号链接（右侧），查看是否你的用户在里面(启动idea或者eclipse 运行程序的用户)，如果不在添加进去

4.需要重启后才能生效

实在不行可以用管理员身份运行程序

仔细排查错误，发现不能运行成功的原因是eclipse运行的账户cxdn没有权限，当我添加权限以后就运行成功了

3.求出共同好友

思路：
   此题目计算共同的QQ好友，分两步走，首先找到
   第一步 map
   读一行 A:B,C,D,F,E,O
   输出
   <B,A><C,A><D,A><F,A><E,A><O,A>
   再读一行 B:A,C,E,K
   输出 <A,B><C,B><E,B><K,B>

<B,A><C,A><D,A><F,A><E,A><O,A>

<A,B><C,B><E,B><K,B>

。。。。。。。。。。

读完过后key相同的聚在一起
<C,A><C,B> 。。。

....
这样就找到A和B的好友中都有C C: A,B ...

然后再来一个mapreduce:
   C是A,B,....的共同好友
   读一行：C:A,B,.....
   拿到的数据是上一个步骤的输出结果
   A   I,H,K,D,F,G,C,O,B,

I,H,K,D,F,G,C,O,B, 进行两两组合

<A-B C> <A-B E>

package com.wx.mapreduce6;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Fensi1 {

	/*
	 * 此题目计算共同的QQ好友，分两步走，首先找到 
	 * 第一步 map 
	 * 读一行 A:B,C,D,F,E,O 
	 * 输出
	 * <B,A><C,A><D,A><F,A><E,A><O,A> 
	 * 在读一行 B:A,C,E,K 
	 * 输出 <A,B><C,B><E,B><K,B>
	 * 
	 */
	static class Fensi1Mappper extends Mapper<LongWritable, Text, Text, Text>
	{
		Text k=new Text();
		Text v=new Text();
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] split = value.toString().split(":");
			v.set(split[0]);
			String[] split2 = split[1].split(",");
			for(String val:split2)
			{
				k.set(val);
				context.write(k, v);
			}
		}
	}
	//<B,A><C,A><D,A><F,A><E,A><O,A> <A,B><C,B><E,B><K,B>
	//<C,A><C,B> <c,D><C,F> ....
	//这样就找到A和B的好友中都有C  C: A,B  ...
	//
	static class Fensi1Reduce extends Reducer<Text, Text, Text, Text>
	{
		Text v=new Text();
		@Override
		protected void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
			//String str="";
			//使用高效字符串拼接
			StringBuilder sb=new StringBuilder();
			for(Text v1:value) //value是A,B,D,F现在需要两两组合,但是在这里做拼接不好拼，所以去下一个map里面拼接
			{
				sb.append(v1).append(",");								
			}
			v.set(sb.toString());
			context.write(key, v);									
		}
	}
	public static void main(String[] args) throws Exception {
		// TODO Auto-generated method stub
        Configuration conf=new Configuration();
        Job job = Job.getInstance(conf);
        
        job.setJarByClass(Fensi1.class);
        job.setMapperClass(Fensi1Mappper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        
        job.setReducerClass(Fensi1Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        
        FileInputFormat.setInputPaths(job, new Path("F:/bigdata/fensi/input"));
        FileOutputFormat.setOutputPath(job, new Path("F:/bigdata/fensi/output1"));
        
        boolean re = job.waitForCompletion(true);
        System.exit(re?0:1);
	}

}

package com.wx.mapreduce6;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.wx.mapreduce6.Fensi1.Fensi1Mappper;
import com.wx.mapreduce6.Fensi1.Fensi1Reduce;
public class Fensi2 {
	/*
	 * C是A,B,D,F的共同好友
	 * 读一行：C:A,B,D,F
	 * 拿到的数据是上一个步骤的输出结果
	 * A	I,H,K,D,F,G,C,O,B,
	 */
	static class Fensi2Mappper extends Mapper<LongWritable, Text, Text, Text>
	{
		Text k=new Text();
		Text v=new Text();		
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String[] split = value.toString().split("\t");
			v.set(split[0]);
			String[] split2 = split[1].split(",");
			Arrays.sort(split2);
			//I,H,K,D,F,G,C,O,B, 进行两两组合
			for(int i=0;i<split2.length-1;i++)
			{
				for(int j=i+1;j<split2.length-1;j++)
				{
					k.set(split2[i]+"-"+split2[j]);
					context.write(k, v);
				}
			}
		}
	}
	//<A-B C> <A-B E>
	static class Fensi2Reduce extends Reducer<Text, Text, Text, Text>
	{
		Text v=new Text();
		@Override
		protected void reduce(Text key, Iterable<Text> values,Context context) throws IOException, InterruptedException {			
			StringBuilder sb=new StringBuilder();
			for(Text value:values)
			{
				sb.append(value).append(",");
				v.set(sb.toString());
			}
			context.write(key, v);
		}
	}
	public static void main(String[] args) throws Exception {	

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(Fensi2.class);
		job.setMapperClass(Fensi2Mappper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		job.setReducerClass(Fensi2Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		FileInputFormat.setInputPaths(job, new Path("F:/bigdata/fensi/output1"));
		FileOutputFormat.setOutputPath(job, new Path("F:/bigdata/fensi/output2"));

		boolean re = job.waitForCompletion(true);
		System.exit(re ? 0 : 1);
	}
}

结果：