- Multi-table join
Idea: in the map phase, emit each record as a <key, value> pair with the id as the key, so that records sharing an id are grouped together automatically during shuffle. At the same time, tag the values coming from the two tables differently, so they can be told apart when taking the Cartesian product in the reduce phase.
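For example (the sample rows below are made up for illustration): suppose table one maps an id to a city and table two maps the same id to two numeric columns. For the row `1 Beijing` in file a and the row `1 100 200` in file b, the mapper emits <1, "#Beijing"> and <1, "$100\t200">; the reducer receives both values under key 1, strips the tags, and writes the joined record `1  Beijing  100  200`.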
The code is as follows:
package org.apache.hadoop.examples;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

public class table_lianjie {

    public static class mapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The next two lines obtain the name of the input file this line came from
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            String name = fileSplit.getPath().getName();
            // Convert the current line to a plain String
            String line = value.toString();
            // Discard invalid (empty) lines
            if (line == null || line.equals("")) return;
            // Split on whitespace
            String[] split = line.split("\\s+");
            if (name.contains("a")) {
                // Records from table one: prefix the city with "#" to distinguish them from table two
                String id = split[0];
                String city = split[1];
                // Emit key = id, value = city
                context.write(new Text(id), new Text("#" + city));
            } else if (name.contains("b")) {
                // Records from table two: prefix the output value with "$" to distinguish them from table one
                String id = split[0];
                String num1 = split[1];
                String num2 = split[2];
                context.write(new Text(id), new Text("$" + num1 + "\t" + num2));
            }
        }
    }

    // Reducer class
    public static class reducer extends Reducer<Text, Text, Text, Text> {
        // The input is <id, {value1, value2, ...}>
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // list1 holds the values that came from table one
            List<String> list1 = new LinkedList<>();
            // list2 holds the values that came from table two
            List<String> list2 = new LinkedList<>();
            // Iterate over the values
            for (Text text : values) {
                String value = text.toString();
                // Values starting with "#" came from table one; add them to list1
                if (value.startsWith("#")) {
                    value = value.substring(1);
                    list1.add(value);
                } else if (value.startsWith("$")) {
                    // Values starting with "$" came from table two; add them to list2
                    value = value.substring(1);
                    list2.add(value);
                }
            }
            // Take the Cartesian product of the two lists for this id: key = id, value = table-one record + table-two record
            for (String a : list1) {
                for (String b : list2) {
                    context.write(key, new Text(a + "\t" + b));
                }
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Boilerplate driver: only the input and output paths need to be changed
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(table_lianjie.class);
        job.setMapperClass(mapper.class);
        job.setReducerClass(reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://lsn-linux:9000/input2"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://lsn-linux:9000/onput_lianjie"));
        System.exit(job.waitForCompletion(true) ? 0 : -1);
    }
}
- Sorting words:
In MapReduce, the shuffle phase automatically sorts the map output by key, so the real work is done in the map phase (emit each word as a key); the reduce phase then simply writes the keys out.
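For instance, assuming an input line such as "banana, apple cherry apple" (made-up data), the mapper emits each word as a key; after the shuffle sort, the reducer writes each distinct word once, producing apple, banana, cherry in order. Note that since each key is written only once in reduce, duplicate words are also removed.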
The code is as follows:
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordSort {

    // Map: emit each word as the key; the shuffle phase sorts the keys
    public static class WordSortMapper
            extends Mapper<LongWritable, Text, Text, NullWritable> {
        // Compile the pattern once instead of once per token
        private static final Pattern WORD = Pattern.compile("[A-Za-z]+");

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Replace commas with spaces, then tokenize on whitespace
            String line = value.toString().replaceAll(",", " ");
            StringTokenizer token = new StringTokenizer(line);
            while (token.hasMoreTokens()) {
                String word = token.nextToken();
                Matcher m = WORD.matcher(word);
                if (m.find()) {
                    context.write(new Text(m.group(0)), NullWritable.get());
                }
            }
        }
    }

    // Reduce: write each key once; the keys arrive already sorted
    public static class IntSumReducer2 extends
            Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        public void reduce(Text key, Iterable<NullWritable> values,
                Context context) throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJarByClass(WordSort.class);
        job.setJobName("Word Sort");
        // Input and output paths
        FileInputFormat.addInputPath(job, new Path("hdfs://lsn-linux:9000/wordcount/zz.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://lsn-linux:9000/ust9"));
        // Map and Reduce classes
        job.setMapperClass(WordSortMapper.class);
        job.setReducerClass(IntSumReducer2.class);
        // Output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- Filtering data
For simple string filtering, the logic can be written directly in either map or reduce; if the filtering is more complex, it is better to factor it out into a separate function. Depending on the requirement, the filtering may belong in the map phase or in the reduce phase, so the code strategy has to follow the problem.
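For example (the sample lines below are made up), with the filter in the code that follows, a record such as `1月23日,湖北,10,20,30` is dropped, while `1月23日,广东,10,20,30` is rewritten to `2020年1月23日,广东,10,20,30` and emitted.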
The code is as follows:
package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class wordcount_guolv {

    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split the CSV line on commas
            String[] words = value.toString().split(",");
            // Prefix the date column with the year
            words[0] = "2020年" + words[0];
            // Keep only complete records whose region column does not contain "湖北"
            if (words.length >= 5) {
                if (!words[1].contains("湖北")) {
                    // Re-join the first five columns into a single output key
                    for (int i = 1; i < 5; i++) {
                        words[0] = words[0] + "," + words[i];
                    }
                    context.write(new Text(words[0]), new Text());
                }
            }
        }
    }

    public static class IntSumReducer2 extends
            Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values,
                Context context) throws IOException, InterruptedException {
            // Write each surviving record once
            context.write(key, new Text());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Delete the output directory if it already exists
        Path outPath = new Path("hdfs://lsn-linux:9000/ouptput5");
        FileSystem hdfs = outPath.getFileSystem(conf);
        if (hdfs.isDirectory(outPath)) {
            hdfs.delete(outPath, true);
        }
        Job job = Job.getInstance(conf);
        job.setJarByClass(wordcount_guolv.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer2.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(
                "hdfs://lsn-linux:9000/input/yq.csv"));
        FileOutputFormat.setOutputPath(job, new Path(
                "hdfs://lsn-linux:9000/ouptput5"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- Counting word occurrences (with sorted output):
Idea: counting words with MapReduce is straightforward: split each line into words, emit each word as the key with 1 as the value, and in the reduce phase sum the values for each word. Because the shuffle phase sorts by key, the output is also ordered by word.
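For instance, for a hypothetical input line "hello world hello", the job outputs "hello 2" and "world 1", already ordered alphabetically by word.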
The code is as follows:
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Tokenize the line and emit <word, 1> for every token
            String line = value.toString();
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the counts for this word
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Delete the output directory if it already exists
        Path outPath = new Path("hdfs://lsn-linux:9000/ust");
        FileSystem hdfs = outPath.getFileSystem(conf);
        if (hdfs.isDirectory(outPath)) {
            hdfs.delete(outPath, true);
        }
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(
                "hdfs://lsn-linux:9000/wordcount/zz.txt"));
        FileOutputFormat.setOutputPath(job, new Path(
                "hdfs://lsn-linux:9000/ust"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- Compute how many phones the three brands sold in total over the year
Idea: the data is a sales log for three phone brands, so it is enough to emit every record's sales count under a single key in the map phase and then sum the values in the reduce phase.
package com.sheng.hdfs;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Mapper: every record contributes its sales count under the single key "总销售量" (total sales)
class Mapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException {
        // Get the current line as a string
        String lines = values.toString();
        // Split the line on commas
        String[] s = lines.split(",");
        // Emit key = "总销售量", value = the sales count column
        context.write(new Text("总销售量"), new IntWritable(Integer.parseInt(s[1])));
    }
}

// Reducer: its input is the mapper's output; sum all counts for the single key
class WcReduce1 extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
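The snippet above only defines the mapper and reducer; the driver is not shown. A minimal driver sketch is given below — the class name Home1 and the output path /user/test/data4 are assumptions for illustration (the input path /user/test/data.csv matches the one used by the later jobs):
package com.sheng.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Home1 {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Home1.class);
        job.setMapperClass(Mapper1.class);
        job.setReducerClass(WcReduce1.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Hypothetical paths; adjust to the actual data location
        FileInputFormat.setInputPaths(job, new Path("/user/test/data.csv"));
        FileOutputFormat.setOutputPath(job, new Path("/user/test/data4"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}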
- Compute how many phones each of the three brands sold over the year
Idea: for this problem we only need to use the brand as the key in the map phase; the reduce phase then produces one sum per brand, giving the three totals. As shown below, a custom partitioner can additionally send each brand to its own reducer and output file.
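The code below assumes each line of data.csv has the form brand,count,date — for example (made-up record) `xiaomi,20,2020年1月3日` — so that s[0] is the brand and s[1] is the number of phones sold.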
package com.sheng.hdfs;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Compute the yearly sales total of each of the three brands
class Mapper2 extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String lines = value.toString();
        String[] s = lines.split(",");
        // Key is the brand name, value is the sales count
        context.write(new Text(s[0]), new IntWritable(Integer.parseInt(s[1])));
    }
}

// Reducer: sum the sales counts for each brand
class Reduce1 extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> value, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : value) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

// Custom partitioner
// Note: the partition is decided by the map output key (the brand)
class Mypartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (key.toString().equals("xiaomi")) {
            return 0;
        }
        if (key.toString().equals("华为")) {
            return 1;
        }
        if (key.toString().equals("IP")) {
            return 2;
        }
        return 3;
    }
}

public class Home2 {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // conf.set("HADOOP_USER_NAME","ambow");
        // Job object
        Job job = Job.getInstance(conf);
        // Register the jar driver class
        job.setJarByClass(Home2.class);
        // Register the Mapper class
        job.setMapperClass(Mapper2.class);
        // Register the Reducer class
        job.setReducerClass(Reduce1.class);
        // Map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input and output paths
        // org.apache.hadoop.mapred.FileInputFormat is the old API
        // org.apache.hadoop.mapreduce.lib.input.FileInputFormat is the new API
        FileInputFormat.setInputPaths(job, new Path("/user/test/data.csv"));
        FileOutputFormat.setOutputPath(job, new Path("/user/test/data5.csv"));
        // FileInputFormat.setInputPaths(job, new Path(args[0]));
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // The number of reduce tasks must cover every partition the partitioner can return;
        // with 3 tasks, any brand other than the three above (partition 3) would fail the job
        job.setPartitionerClass(Mypartitioner.class);
        job.setNumReduceTasks(3);
        // Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
- Compute each brand's monthly sales total
Here we not only need to distinguish the brands but also split the data by month, so the map key combines the month with the brand; in the reduce phase we then sum the sales for each (month, brand) pair. A partitioner again routes each brand to its own reducer.
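As a worked example (the record itself is made up), for an input line `huawei,35,2020年3月12日` the mapper below builds the key "3月huawei" with value 35, so the reducer sums all of huawei's March sales under that one key.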
package com.sheng.hdfs;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/*
 * Compute the monthly sales total of each brand
 */
class WcMapper2 extends Mapper<LongWritable, Text, Text, IntWritable> {
    /*
     * KeyIn: LongWritable, the byte offset of the line; ValueIn: Text, the line itself (TextInputFormat)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get the current line as a string
        String lines = value.toString();
        // Split the line on commas: s[0] = brand, s[1] = count, s[2] = date
        String[] s = lines.split(",");
        // Extract the month from a date such as 2020年1月3日:
        // take the text before "月" and drop the first five characters ("2020年")
        String str1 = s[2];
        String[] s1 = str1.split("月");
        String str3 = s1[0];
        String str2 = str3.substring(5);
        // Emit key = month + brand, value = the sales count (Hadoop-serializable types)
        context.write(new Text(str2 + "月" + s[0]), new IntWritable(Integer.parseInt(s[1])));
    }
}

class WcReduce2 extends Reducer<Text, IntWritable, Text, IntWritable> {
    // reduce(key = month + brand, values = the counts collected for that key, context)
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

class MyPartitioner1 extends Partitioner<Text, IntWritable> {
    // Route each brand to a different reducer (4 partitions in total)
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // The key has the form "<month>月<brand>", so match on the brand suffix
        if (key.toString().endsWith("xiaomi"))
            return 0;
        if (key.toString().endsWith("huawei"))
            return 1;
        if (key.toString().endsWith("iphone7"))
            return 2;
        return 3;
    }
}

public class Home3 {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // conf.set("HADOOP_USER_NAME","ambow");
        // Job object
        Job job = Job.getInstance(conf);
        // Register the jar driver class
        job.setJarByClass(Home3.class);
        // Register the Mapper class
        job.setMapperClass(WcMapper2.class);
        // Register the Reducer class
        job.setReducerClass(WcReduce2.class);
        // Map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input and output paths
        // org.apache.hadoop.mapred.FileInputFormat is the old API
        // org.apache.hadoop.mapreduce.lib.input.FileInputFormat is the new API
        FileInputFormat.setInputPaths(job, new Path("/user/test/data.csv"));
        FileOutputFormat.setOutputPath(job, new Path("/user/test/data6.csv"));
        // FileInputFormat.setInputPaths(job, new Path(args[0]));
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // One reduce task per partition returned by the partitioner
        job.setPartitionerClass(MyPartitioner1.class);
        job.setNumReduceTasks(4);
        // Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}