Hadoop数据缓存

最新推荐文章于 2024-11-03 01:12:52 发布

罗刹海是市式市世视士

最新推荐文章于 2024-11-03 01:12:52 发布

阅读量45

点赞数

文章标签： hadoop 缓存 大数据

本文链接：https://blog.csdn.net/berbai/article/details/132567601

版权

（一）MapJoinMapper

代码

package cn.china.kb23.demo4;

import cn.china.kb23.demo3.CustomerOrders;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
/**
 * Map-side join mapper: joins each order record against customer data that is
 * loaded from the job's cache files before any record is processed.
 *
 * <p>{@link #setup} reads every cache file whose name starts with
 * {@code "customers"} into an in-memory id-to-name lookup; {@link #map} then
 * emits one {@link CustomerOrders} per order line, enriched with the matching
 * customer name (or an empty name when no customer matches).
 */
public class MapJoinMapper extends Mapper<LongWritable, Text, CustomerOrders, NullWritable> {
    /** Customer id mapped to customer name, filled once per task in {@link #setup}. */
    private final Map<Integer, String> customerNames = new HashMap<>();

    /**
     * Loads the customer lookup table from the cached {@code customers*} file(s).
     *
     * <p>Each non-empty line is split on {@code ","}; field 0 is parsed as the
     * customer id and fields 1 and 2 are concatenated to form the customer name.
     * If the same id appears more than once, the last line wins.
     *
     * @param context task context supplying the cache file URIs
     * @throws IOException if a cached customers file cannot be opened or read
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        URI[] cacheFiles = context.getCacheFiles();
        if (cacheFiles == null) {
            // No cache files were registered on the job: nothing to load.
            return;
        }
        for (URI uri : cacheFiles) {
            System.out.println(uri.getPath());
            String currentFileName = new Path(uri).getName();
            if (!currentFileName.startsWith("customers")) {
                continue;
            }
            // try-with-resources guarantees the reader (and the underlying stream) is
            // closed even when parsing fails. UTF-8 matches the encoding Hadoop's Text
            // uses for the order records.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(uri.getPath()), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.isEmpty()) {
                        // Tolerate blank lines (e.g. trailing newlines) in the customers file.
                        continue;
                    }
                    String[] fields = line.split(",");
                    customerNames.put(Integer.parseInt(fields[0]), fields[1] + fields[2]);
                }
            }
        }
    }

    /**
     * Emits one joined record per order line.
     *
     * <p>The order line is split on {@code ","}: field 0 is the order id, field 2
     * the customer id and field 3 the order status. A fresh output object is built
     * for every record so the cached customer data is never mutated.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one comma-separated order line
     * @param context task context used to write the joined record
     * @throws IOException if writing the output record fails
     * @throws InterruptedException if writing the output record is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] orderFields = value.toString().split(",");
        int customerId = Integer.parseInt(orderFields[2]);

        // Orders without a matching customer are still emitted, with an empty name.
        String customerName = customerNames.get(customerId);

        CustomerOrders customerOrder = new CustomerOrders();
        customerOrder.setCustomerName(customerName != null ? customerName : "");
        customerOrder.setOrderStatus(orderFields[3]);
        customerOrder.setOrderId(Integer.parseInt(orderFields[0]));
        customerOrder.setCustomerId(customerId);
        // Flag value "1" is kept from the original implementation.
        customerOrder.setFlag("1");

        context.write(customerOrder, NullWritable.get());
    }
}

（二）MapJoinDriver

代码

package cn.china.kb23.demo4;

import cn.china.kb23.demo3.CustomerOrders;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for the map-only join job that uses {@link MapJoinMapper}.
 *
 * <p>Configures the job, removes any existing output directory, registers the
 * customers file as a cache file, runs the job and prints the elapsed time.
 */
public class MapJoinDriver {
    /** Default orders input file, used when no first argument is given. */
    private static final String DEFAULT_INPUT = "E:\\springboot\\kb23\\in\\demo3\\orders.csv";
    /** Default output directory, used when no second argument is given. */
    private static final String DEFAULT_OUTPUT = "E:\\springboot\\kb23\\out\\outdemo4";
    /** Default customers cache file, used when no third argument is given. */
    private static final String DEFAULT_CACHE = "E:\\springboot\\kb23\\in\\demo3\\customers.csv";

    /**
     * Runs the map-side join job.
     *
     * @param args optional overrides: {@code args[0]} orders input path,
     *             {@code args[1]} output directory, {@code args[2]} customers
     *             cache file; any missing argument falls back to its default
     * @throws IOException if the file system or job submission fails
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws InterruptedException if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Start of the wall-clock measurement.
        long start = System.currentTimeMillis();

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(MapJoinDriver.class);

        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(CustomerOrders.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Input: the orders file.
        Path inPath = new Path(argOrDefault(args, 0, DEFAULT_INPUT));
        FileInputFormat.setInputPaths(job, inPath);

        // Output: delete a pre-existing directory so the job can be re-run.
        Path outpath = new Path(argOrDefault(args, 1, DEFAULT_OUTPUT));
        FileSystem fs = FileSystem.get(outpath.toUri(), conf);
        if (fs.exists(outpath)) {
            fs.delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);

        // Zero reduce tasks: the join happens entirely in the mapper.
        job.setNumReduceTasks(0);

        // Register the customers file as a cache file. MapJoinMapper reads the whole
        // file into memory in setup(), so it must stay small.
        Path cachepath = new Path(argOrDefault(args, 2, DEFAULT_CACHE));
        job.addCacheFile(cachepath.toUri());

        boolean succeeded = job.waitForCompletion(true);

        long end = System.currentTimeMillis();
        System.out.println("运行时间："+(end-start));

        // Surface job failure to the caller instead of always exiting with status 0.
        if (!succeeded) {
            System.exit(1);
        }
    }

    /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }
}