hadoop06--(mapreduce增强)

最新推荐文章于 2022-06-10 19:27:27 发布

时空恋旅人

最新推荐文章于 2022-06-10 19:27:27 发布

阅读量183

点赞数 1

分类专栏：大数据离线计算

本文链接：https://blog.csdn.net/weixin_37650458/article/details/85725043

版权

大数据同时被 2 个专栏收录

12 篇文章 1 订阅

订阅专栏

离线计算

6 篇文章 0 订阅

订阅专栏

1.倒排索引的建立：

需求：有大量的文本（文档、网页），需要建立搜索索引

计算每个单词在每个文件的出现次数并且将他们排序

创建好输入的文件：

思路：一行一行的读，拿到单词，并且拿到文件名字，将单词和文件名字合并在一起作为key输出，然后在reduce端统计即可

第一次输出的结果：

第二次的输出结果：

代码：

package com.wx.mapreduce5;
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ReverseIndexOne {
    /*
     * 倒排索引例子，计算每个单词在每个文件的出现次数并且将他们排序
     */
    static class ReverseIndexOneMapper extends Mapper<LongWritable, Text, Text, IntWritable>
    {
        Text k=new Text();
        IntWritable v=new IntWritable(1);
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(" ");//shift+alt+L
            //拿到文件名字
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String name = inputSplit.getPath().getName();
            //拼接key，key是单词和文件名的合体
            for(String word: split)
            {
                k.set(word+"--"+name);
                context.write(k, v);
            }
        }
    }
    /*
     * 输入：<hello--a.txt,1> <hello--a.txt,1> <hello--a.txt,1> <hello--b.txt,1> <hello--b.txt,1>
     * 输出：<hello--a.txt,3> <hello--b.txt,2>
     */
    static class ReverseIndexOneReduce extends Reducer<Text, IntWritable, Text, IntWritable>
    {
        IntWritable v=new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
            int count=0;
            for(IntWritable val:values)
            {
                count+=val.get();
                v.set(count);
            }
            context.write(key, v);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf=new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(ReverseIndexOne.class);
        //设置执行map的类以及输出的数据的类型
        job.setMapperClass(ReverseIndexOneMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        //执行reduce的类以及最终输出结果
        job.setReducerClass(ReverseIndexOneReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        //设置读取处理文件的路径
        FileInputFormat.setInputPaths(job, new Path("F:/bigdata/reverseindex/steponeinput"));
        FileOutputFormat.setOutputPath(job, new Path("F:/bigdata/reverseindex/steponeoutput1"));
        //指定一个文件需要缓存到maptask的工作目录
        //job.addArchiveToClassPath(archive);//缓存jar包到maptask运行节点的classpath中
        //job.addFileToClassPath(file);//缓存普通文件到maptask的运行节点的calsspath中
        //job.addCacheArchive(uri);//缓存压缩包文件到calsspath中
        //job.addCacheFile(new URI("file:/F:/wordcount/mapjoincache/pds.txt"));//将产品表的产品文件缓存到task的工作目录中去

        //设置不需要reduce
        //job.setNumReduceTasks(0);

        boolean re = job.waitForCompletion(true);
        System.exit(re?0:1);
    }

}

package com.wx.mapreduce5;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ReverseIndexTwo {

    /*  把步骤一处理好的数据如下，进行拼接
     * guofucheng--b.txt	1
       guofucheng--b.txt	2
       hai--c.txt	1
       hello--a.txt	1
       hello--a.txt	2
     */
    static class ReverseIndexTwoMapper extends Mapper<LongWritable, Text, Text, Text>
    {
        Text k=new Text();
        Text v=new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] split = value.toString().split("--");
            k.set(split[0]);
            v.set(split[1]);
            context.write(k, v);
        }
    }
    //<guofucheng,b.txt	1> <guofucheng,b.txt	2>
    static class ReverseIndexTwoReduce extends Reducer<Text, Text, Text, Text>
    {
        Text k=new Text();
        Text v=new Text();
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String str="";
            for(Text value:values)//把value拼接起来即可
            {
                str+=value;
                v.set(str);
            }
            k.set(key);
            context.write(k,v);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf=new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(ReverseIndexTwo.class);
        //设置执行map的类以及输出的数据的类型
        job.setMapperClass(ReverseIndexTwoMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //执行reduce的类以及最终输出结果
        job.setReducerClass(ReverseIndexTwoReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        //设置读取处理文件的路径
        FileInputFormat.setInputPaths(job, new Path("F:/bigdata/reverseindex/steponeoutput1"));
        FileOutputFormat.setOutputPath(job, new Path("F:/bigdata/reverseindex/steponeoutput2"));
        //指定一个文件需要缓存到maptask的工作目录
        //job.addArchiveToClassPath(archive);//缓存jar包到maptask运行节点的classpath中
        //job.addFileToClassPath(file);//缓存普通文件到maptask的运行节点的calsspath中
        //job.addCacheArchive(uri);//缓存压缩包文件到calsspath中
        //job.addCacheFile(new URI("file:/F:/wordcount/mapjoincache/pds.txt"));//将产品表的产品文件缓存到task的工作目录中去
        //设置不需要reduce
        //job.setNumReduceTasks(0);
        boolean re = job.waitForCompletion(true);
        System.exit(re?0:1);
    }
}

2.有如下订单数据，现在需要求出每一个订单中成交金额最大的一笔交易

1、利用“订单id和成交金额”作为key，可以将map阶段读取到的所有订单数据按照id分区，按照金额排序，发送到reduce

2、在reduce端利用groupingcomparator将订单id相同的kv聚合成组，然后取第一个即是最大值

经过map处理过后利用partitioner按照订单id分区，所有的id相同的订单bean分在一起，并且按钱的大小排好了顺序，但是要保证对象进入同一个reduce还需要WritableComparator将分在一个的bean安排竟日同一个reduce,依据就是把key本来是比对象进入reduce,现在把他按比id进入reduce,这样他们就会按照相同的订单号进入同一个reduce,并且已经是排好序的，直接输出最大即可。

第一种分区HashPartitioner:

HashPartitioner的分区数=k.hashcode%numReduceTasks 而numReduceTasks 是用户自己设定的，job.setnumReduceTasks(5) ,比如设为5，那么k.hashcode模5的结果就可能是0，1，2，3，4所以共五个分区。

第二种分区，ProvincePartitioner:

比如说事先定义了六个省，那么就一定会有6个分析，numReduceTasksde 的数量就不能不设，不能瞎设，必须要给前面的对的上

准备输入数据：

输出的结果：

代码：

package com.wx.mapreduce7;

import java.io.IOException;

import org.apache.avro.Schema.Field.Order;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OrderSort {

	/*
	 * 读入一行数据Order_0000001	Pdt_01	222.8
	 * 输出<bran,null>,经过分区排序的
	 * <Order_0000001 222.8,null><Order_0000001 100,null><Order_0000002 100,null>
	 */
	static class OrderSortMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable>
	{
		OrderBean k=new OrderBean();
		Text itemid=new Text();
		DoubleWritable price=new DoubleWritable();
		@Override
		protected void map(LongWritable key, Text value,Context context) throws IOException, InterruptedException {
			String[] split = value.toString().split(",");
			itemid.set(split[0]);
			k.setItemid(itemid);
			price.set(Double.parseDouble(split[2]));
			k.setPrice(price);
			context.write(k, NullWritable.get());
		}
	}
	/*
	 * 
	 */
	static class OrderSortReduce extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable>
	{
		@Override
		protected void reduce(OrderBean bean, Iterable<NullWritable> value,Context context)
				throws IOException, InterruptedException {
			context.write(bean, NullWritable.get());
		}
	}
	public static void main(String[] args) throws Exception {
		// TODO Auto-generated method stub
         Configuration conf=new Configuration();
         Job job = Job.getInstance(conf);
         job.setJarByClass(OrderSort.class);
         job.setMapperClass(OrderSortMapper.class);
         job.setReducerClass(OrderSortReduce.class);
         
         //map的输出数据类型
         job.setMapOutputKeyClass(OrderBean.class);
         job.setMapOutputValueClass(NullWritable.class);
         //指定最终输出数据的类型
         job.setOutputKeyClass(OrderBean.class);
         job.setOutputValueClass(NullWritable.class);
         //指定文件输入路径
         FileInputFormat.setInputPaths(job, new Path("F:/bigdata/ordersort/input"));
         FileOutputFormat.setOutputPath(job, new Path("F:/bigdata/ordersort/output"));
         
       //在此设置自定义的Groupingcomparator类 
 		job.setGroupingComparatorClass(ItemidGroupingComparator.class);
 		//在此设置自定义的partitioner类
 		job.setPartitionerClass(ItemIdPartitioner.class);
        job.setNumReduceTasks(1);
		
		job.waitForCompletion(true);
         
	}

}

package com.wx.mapreduce6;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class OrderBean implements WritableComparable<OrderBean> {

    private Text itemid;
    private DoubleWritable price;
    //分区会调反射的方法
    public OrderBean()
    {

    }
    public OrderBean(Text itemid,DoubleWritable price)
    {
        this.itemid=itemid;
        this.price=price;
    }
    public Text getItemid() {
        return itemid;
    }

    public void setItemid(Text itemid) {
        this.itemid = itemid;
    }

    public DoubleWritable getPrice() {
        return price;
    }

    public void setPrice(DoubleWritable price) {
        this.price = price;
    }

    public void readFields(DataInput in) throws IOException {
        // TODO Auto-generated method stub
        this.itemid=new Text(in.readUTF());
        this.price=new DoubleWritable(in.readDouble());
    }

    public void write(DataOutput out) throws IOException {
        // TODO Auto-generated method stub
        out.writeUTF(itemid.toString());
        out.writeDouble(price.get());
    }
    public int compareTo(OrderBean o) {
        //比较订单号，如果订单号相同，则按价钱倒序排序出来,如果不同，返回1就是按订单号正序排序
        int com = this.itemid.compareTo(o.getItemid());
        if(com==0)
        {
            com=-this.price.compareTo(o.getPrice());
        }
        return com;
    }
    @Override
    public String toString() {
        // TODO Auto-generated method stub
        return itemid.toString()+":"+price.get();
    }
}

package com.wx.mapreduce6;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class ItemIdPartitioner extends Partitioner<OrderBean, NullWritable> {
    @Override
    public int getPartition(OrderBean bean, NullWritable value, int numReduceTasks) {
        // TODO Auto-generated method stub
        //相同id的订单bean，会发往相同的partition
        //而且，产生的分区数，是会跟用户设置的reduce task数保持一致
        return (bean.getItemid().hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}

package com.wx.mapreduce6;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class ItemidGroupingComparator extends WritableComparator  {

    /*
     * 利用reduce端的GroupingComparator来实现将一组bean看成相同的key
     * 因为bean是一个对象如果直接传到reduce，那个每个对象就是不同的，所以用该类
     *
     */
    //传入作为key的bean的class类型，以及制定需要让框架做反射获取实例对象
    protected ItemidGroupingComparator() {
        super(OrderBean.class, true);
    }
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // TODO Auto-generated method stub
        OrderBean abean = (OrderBean) a;
        OrderBean bbean = (OrderBean) b;

        //比较两个bean时，指定只比较bean中的orderid
        return abean.getItemid().compareTo(bbean.getItemid());
    }
}

3.日志清洗：运营商流量日志增强

现有一些原始日志需要做增强解析处理，流程：

从原始日志文件中读取数据
根据日志中的一个URL字段到外部知识库中获取信息增强到原始日志
如果成功增强，则输出到增强结果目录；如果增强失败，则抽取原始数据中URL字段输出到待爬清单目录

时空恋旅人

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
hadoop06--(mapreduce增强)

1.倒排索引的建立：需求：有大量的文本（文档、网页），需要建立搜索索引计算每个单词在每个文件的出现次数并且将他们排序创建好输入的文件：思路：一行一行的读，拿到单词，并且拿到文件名字，将单词和文件名字合并在一起作为key输出，然后在reduce端统计即可第一次输出的结果：第二次的输出结果：代码： package com.wx.m...
复制链接

扫一扫