任务:
1.查找相同字母组成的单词
一本英文书籍包含成千上万个单词,现在我们需要在大量的单词中,找出相同字母组成的所有单词。
2.编写程序实现对输入文件的排序
现在有多个输入文件,每个文件中的每行内容均为一个整数。要求读取所有文件中的整数,进行升序排序后,输出到一个新的文件中,输出的数据格式为每行两个整数,第一个数字为第二个整数的排序位次,第二个整数为原待排列的整数。
数据集:
1.查找相同字母组成的单词
cat
tar
bar
act
rat
2.编写程序实现对输入文件的排序
file1.txt:
33
37
12
40
file2.txt:
4
16
39
5
file3.txt:
1
45
25
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Anagram{
/** 排序、分组*/
public static class AnagramMapper extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable key, Text value,Context context) throws IOException ,
InterruptedException {
String text = value.toString(); // 将Text转换成String
char[] textCharArr = text.toCharArray();//将String转换成字符数组,为排序作准备
Arrays.sort(textCharArr); // 使用 Arrays对数组进行排序
String sortedText = new String(textCharArr); // 排序后的字符串
context.write(new Text(sortedText), value);
}}
/** 统计相同字母组成的单词*/
public static class AnagramReduce extends Reducer<Text, Text, Text, Text>{
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
StringBuilder value = new StringBuilder(); // 值
int count = 0; // 计数
for(Text text : values){ // 拼接单词
if(value.length() > 0){ // 分割符,
value.append(",");
}
value.append(text);
count++; // 计数
}
//因为要统计相同字母组成的单词,所以相同字母组成的单词个数大于等于2才会输出
if(count > 1){
context.write(key, new Text(value.toString()));
}
}
}
public static void main(String[] args) throws Exception {
Configuration configuration=new Configuration();
Path inpath=new Path("hdfs://192.168.109.125:8020/input");
Path outpath=new Path("hdfs://192.168.109.125:8020/output");
Job job=Job.getInstance(configuration);
job.setJarByClass(Anagram.class);
// 指定mapper、reduce
job.setMapperClass(AnagramMapper.class);
job.setReducerClass(AnagramReduce.class);
// 指定mapper、reduce的输出类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// 指定输入、输出目录
FileInputFormat.addInputPath(job, inpath);
FileOutputFormat.setOutputPath(job, outpath);
//提交作业并等待执行完成。
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class NumberSort { public static class Map extends Mapper<Object, Text, IntWritable, IntWritable> { private static IntWritable data = new IntWritable(); public void map(Object key,Text value,Context context)throws IOException, InterruptedException { String line = value.toString(); System.out.println("line:"+ line); context.write(new IntWritable(Integer.parseInt(value.toString())),new IntWritable(1)); } } public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> { private static IntWritable linenumber = new IntWritable(1); @Override protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { for (IntWritable val : values) { context.write(linenumber, key); linenumber = new IntWritable(linenumber.get() + 1); } } } public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf=new Configuration(); Job job=Job.getInstance(conf); job.setJarByClass(NumberSort.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path("hdfs://192.168.109.125:8020/input")); FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.109.125:8020/output")); System.exit(job.waitForCompletion(true) ? 0 : 1); } }
输出结果:
1.查找相同字母组成的单词
act act,cat
art rat,tar
2.编写程序实现对输入文件的排序
1 1
2 4
3 5
4 12
5 16
6 25
7 33
8 37
9 39
10 40
11 45