- Multi-table join
Idea: in the map phase, emit each record as a <key, value> pair with the id as the key, so that records sharing an id are grouped together automatically during shuffle. At the same time, tag the values coming from the two tables differently, so they can be told apart when taking the Cartesian product in the reduce phase.
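For example (the sample rows below are made up for illustration): suppose table one maps an id to a city and table two maps the same id to two numeric columns. For the row `1 Beijing` in file a and the row `1 100 200` in file b, the mapper emits <1, "#Beijing"> and <1, "$100\t200">; the reducer receives both values under key 1, strips the tags, and writes the joined record `1  Beijing  100  200`.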
The code is as follows:
package org.apache.hadoop.examples;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

public class table_lianjie {

    public static class mapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The next two lines obtain the name of the input file this line came from
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            String name = fileSplit.getPath().getName();
            // Convert the current line to a plain String
            String line = value.toString();
            // Discard invalid (empty) lines
            if (line == null || line.equals("")) return;
            // Split on whitespace
            String[] split = line.split("\\s+");
            if (name.contains("a")) {
                // Records from table one: prefix the city with "#" to distinguish them from table two
                String id = split[0];
                String city = split[1];
                // Emit key = id, value = city
                context.write(new Text(id), new Text("#" + city));
            } else if (name.contains("b")) {
                // Records from table two: prefix the output value with "$" to distinguish them from table one
                String id = split[0];
                String num1 = split[1];
                String num2 = split[2];
                context.write(new Text(id), new Text("$" + num1 + "\t" + num2));
            }
        }
    }

    // Reducer class
    public static class reducer extends Reducer<Text, Text, Text, Text> {
        // The input is <id, {value1, value2, ...}>
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // list1 holds the values that came from table one
            List<String> list1 = new LinkedList<>();
            // list2 holds the values that came from table two
            List<String> list2 = new LinkedList<>();
            // Iterate over the values
            for (Text text : values) {
                String value = text.toString();
                // Values starting with "#" came from table one; add them to list1
                if (value.startsWith("#")) {
                    value = value.substring(1);
                    list1.add(value);
                } else if (value.startsWith("$")) {
                    // Values starting with "$" came from table two; add them to list2
                    value = value.substring(1);
                    list2.add(value);
                }
            }
            // Take the Cartesian product of the two lists for this id: key = id, value = table-one record + table-two record
            for (String a : list1) {
                for (String b : list2) {
                    context.write(key, new Text(a + "\t" + b));
                }
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Boilerplate driver: only the input and output paths need to be changed
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(table_lianjie.class);
        job.setMapperClass(mapper.class);
        job.setReducerClass(reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://lsn-linux:9000/input2"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://lsn-linux:9000/onput_lianjie"));
        System.exit(job.waitForCompletion(true) ? 0 : -1);
    }
}
- Sorting words:
In MapReduce, the shuffle phase automatically sorts the map output by key, so the real work is done in the map phase (emit each word as a key); the reduce phase then simply writes the keys out.
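For instance, assuming an input line such as "banana, apple cherry apple" (made-up data), the mapper emits each word as a key; after the shuffle sort, the reducer writes each distinct word once, producing apple, banana, cherry in order. Note that since each key is written only once in reduce, duplicate words are also removed.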
The code is as follows:
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordSort {

    // Map: emit each word as the key; the shuffle phase sorts the keys
    public static class WordSortMapper
            extends Mapper<LongWritable, Text, Text, NullWritable> {
        // Compile the pattern once instead of once per token
        private static final Pattern WORD = Pattern.compile("[A-Za-z]+");

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Replace commas with spaces, then tokenize on whitespace
            String line = value.toString().replaceAll(",", " ");
            StringTokenizer token = new StringTokenizer(line);
            while (token.hasMoreTokens()) {
                String word = token.nextToken();
                Matcher m = WORD.matcher(word);
                if (m.find()) {
                    context.write(new Text(m.group(0)), NullWritable.get());
                }
            }
        }
    }

    // Reduce: write each key once; the keys arrive already sorted
    public static class IntSumReducer2 extends
            Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        public void reduce(Text key, Iterable<NullWritable> values,
                Context context) throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJarByClass(WordSort.class);
        job.setJobName("Word Sort");
        // Input and output paths
        FileInputFormat.addInputPath(job, new Path("hdfs://lsn-linux:9000/wordcount/zz.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://lsn-linux:9000/ust9"));
        // Map and Reduce classes
        job.setMapperClass(WordSortMapper.class);
        job.setReducerClass(IntSumReducer2.class);
        // Output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- Filtering data
For simple string filtering, the logic can be written directly in either map or reduce; if the filtering is more complex, it is better to factor it out into a separate function. Depending on the requirement, the filtering may belong in the map phase or in the reduce phase, so the code strategy has to follow the problem.
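For example (the sample lines below are made up), with the filter in the code that follows, a record such as `1月23日,湖北,10,20,30` is dropped, while `1月23日,广东,10,20,30` is rewritten to `2020年1月23日,广东,10,20,30` and emitted.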
The code is as follows:
package org.apache.hadoop.examples;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class wordcount_guolv {

    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split the CSV line on commas
            String[] words = value.toString().split(",");
            // Prefix the date column with the year
            words[0] = "2020年" + words[0];
            // Keep only complete records whose region column does not contain "湖北"
            if (words.length >= 5) {
                if (!words[1].contains("湖北")) {
                    // Re-join the first five columns into a single output key
                    for (int i = 1; i < 5; i++) {
                        words[0] = words[0] + "," + words[i];
                    }
                    context.write(new Text(words[0]), new Text());
                }
            }
        }
    }

    public static class IntSumReducer2 extends
            Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values,
                Context context) throws IOException, InterruptedException {
            // Write each surviving record once
            context.write(key, new Text());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Delete the output directory if it already exists
        Path outPath = new Path("hdfs://lsn-linux:9000/ouptput5");
        FileSystem hdfs = outPath.getFileSystem(conf);
        if (hdfs.isDirectory(outPath)) {
            hdfs.delete(outPath, true);
        }
        Job job = Job.getInstance(conf);
        job.setJarByClass(wordcount_guolv.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer2.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(
                "hdfs://lsn-linux:9000/input/yq.csv"));
        FileOutputFormat.setOutputPath(job, new Path(
                "hdfs://lsn-linux:9000/ouptput5"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- Counting word occurrences (with sorted output):
Idea: counting words with MapReduce is straightforward: split each line into words, emit each word as the key with 1 as the value, and in the reduce phase sum the values for each word. Because the shuffle phase sorts by key, the output is also ordered by word.
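For instance, for a hypothetical input line "hello world hello", the job outputs "hello 2" and "world 1", already ordered alphabetically by word.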
The code is as follows:
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Tokenize the line and emit <word, 1> for every token
            String line = value.toString();
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the counts for this word
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Delete the output directory if it already exists
        Path outPath = new Path("hdfs://lsn-linux:9000/ust");
        FileSystem hdfs = outPath.getFileSystem(conf);
        if (hdfs.isDirectory(outPath)) {
            hdfs.delete(outPath, true);
        }
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(
                "hdfs://lsn-linux:9000/wordcount/zz.txt"));
        FileOutputFormat.setOutputPath(job, new Path(
                "hdfs://lsn-linux:9000/ust"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- Compute how many phones the three brands sold in total over the year
Idea: the data is a sales log for three phone brands, so it is enough to emit every record's sales count under a single key in the map phase and then sum the values in the reduce phase.
package com.sheng.hdfs;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Mapper: every record contributes its sales count under the single key "总销售量" (total sales)
class Mapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException {
        // Get the current line as a string
        String lines = values.toString();
        // Split the line on commas
        String[] s = lines.split(",");
        // Emit key = "总销售量", value = the sales count column
        context.write(new Text("总销售量"), new IntWritable(Integer.parseInt(s[1])));
    }
}

// Reducer: its input is the mapper's output; sum all counts for the single key
class WcReduce1 extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
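The snippet above only defines the mapper and reducer; the driver is not shown. A minimal driver sketch is given below — the class name Home1 and the output path /user/test/data4 are assumptions for illustration (the input path /user/test/data.csv matches the one used by the later jobs):
package com.sheng.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Home1 {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Home1.class);
        job.setMapperClass(Mapper1.class);
        job.setReducerClass(WcReduce1.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Hypothetical paths; adjust to the actual data location
        FileInputFormat.setInputPaths(job, new Path("/user/test/data.csv"));
        FileOutputFormat.setOutputPath(job, new Path("/user/test/data4"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}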
- Compute how many phones each of the three brands sold over the year
Idea: for this problem we only need to use the brand as the key in the map phase; the reduce phase then produces one sum per brand, giving the three totals. As shown below, a custom partitioner can additionally send each brand to its own reducer and output file.
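The code below assumes each line of data.csv has the form brand,count,date — for example (made-up record) `xiaomi,20,2020年1月3日` — so that s[0] is the brand and s[1] is the number of phones sold.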
package com.sheng.hdfs;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Compute the yearly sales total of each of the three brands
class Mapper2 extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String lines = value.toString();
        String[] s = lines.split(",");
        // Key is the brand name, value is the sales count
        context.write(new Text(s[0]), new IntWritable(Integer.parseInt(s[1])));
    }
}

// Reducer: sum the sales counts for each brand
class Reduce1 extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> value, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : value) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

// Custom partitioner
// Note: the partition is decided by the map output key (the brand)
class Mypartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (key.toString().equals("xiaomi")) {
            return 0;
        }
        if (key.toString().equals("华为")) {
            return 1;
        }
        if (key.toString().equals("IP")) {
            return 2;
        }
        return 3;
    }
}

public class Home2 {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // conf.set("HADOOP_USER_NAME","ambow");
        // Job object
        Job job = Job.getInstance(conf);
        // Register the jar driver class
        job.setJarByClass(Home2.class);
        // Register the Mapper class
        job.setMapperClass(Mapper2.class);
        // Register the Reducer class
        job.setReducerClass(Reduce1.class);
        // Map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input and output paths
        // org.apache.hadoop.mapred.FileInputFormat is the old API
        // org.apache.hadoop.mapreduce.lib.input.FileInputFormat is the new API
        FileInputFormat.setInputPaths(job, new Path("/user/test/data.csv"));
        FileOutputFormat.setOutputPath(job, new Path("/user/test/data5.csv"));
        // FileInputFormat.setInputPaths(job, new Path(args[0]));
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // The number of reduce tasks must cover every partition the partitioner can return;
        // with 3 tasks, any brand other than the three above (partition 3) would fail the job
        job.setPartitionerClass(Mypartitioner.class);
        job.setNumReduceTasks(3);
        // Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
- Compute each brand's monthly sales total
Here we not only need to distinguish the brands but also split the data by month, so the map key combines the month with the brand; in the reduce phase we then sum the sales for each (month, brand) pair. A partitioner again routes each brand to its own reducer.
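As a worked example (the record itself is made up), for an input line `huawei,35,2020年3月12日` the mapper below builds the key "3月huawei" with value 35, so the reducer sums all of huawei's March sales under that one key.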
package com.sheng.hdfs;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/*
 * Compute the monthly sales total of each brand
 */
class WcMapper2 extends Mapper<LongWritable, Text, Text, IntWritable> {
    /*
     * KeyIn: LongWritable, the byte offset of the line; ValueIn: Text, the line itself (TextInputFormat)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get the current line as a string
        String lines = value.toString();
        // Split the line on commas: s[0] = brand, s[1] = count, s[2] = date
        String[] s = lines.split(",");
        // Extract the month from a date such as 2020年1月3日:
        // take the text before "月" and drop the first five characters ("2020年")
        String str1 = s[2];
        String[] s1 = str1.split("月");
        String str3 = s1[0];
        String str2 = str3.substring(5);
        // Emit key = month + brand, value = the sales count (Hadoop-serializable types)
        context.write(new Text(str2 + "月" + s[0]), new IntWritable(Integer.parseInt(s[1])));
    }
}

class WcReduce2 extends Reducer<Text, IntWritable, Text, IntWritable> {
    // reduce(key = month + brand, values = the counts collected for that key, context)
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

class MyPartitioner1 extends Partitioner<Text, IntWritable> {
    // Route each brand to a different reducer (4 partitions in total)
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // The key has the form "<month>月<brand>", so match on the brand suffix
        if (key.toString().endsWith("xiaomi"))
            return 0;
        if (key.toString().endsWith("huawei"))
            return 1;
        if (key.toString().endsWith("iphone7"))
            return 2;
        return 3;
    }
}

public class Home3 {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // conf.set("HADOOP_USER_NAME","ambow");
        // Job object
        Job job = Job.getInstance(conf);
        // Register the jar driver class
        job.setJarByClass(Home3.class);
        // Register the Mapper class
        job.setMapperClass(WcMapper2.class);
        // Register the Reducer class
        job.setReducerClass(WcReduce2.class);
        // Map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input and output paths
        // org.apache.hadoop.mapred.FileInputFormat is the old API
        // org.apache.hadoop.mapreduce.lib.input.FileInputFormat is the new API
        FileInputFormat.setInputPaths(job, new Path("/user/test/data.csv"));
        FileOutputFormat.setOutputPath(job, new Path("/user/test/data6.csv"));
        // FileInputFormat.setInputPaths(job, new Path(args[0]));
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // One reduce task per partition returned by the partitioner
        job.setPartitionerClass(MyPartitioner1.class);
        job.setNumReduceTasks(4);
        // Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}