1.在hdfs目录/tmp/input/wordcount目录中有一系列文件,内容为","号分隔,分隔后的元素均为数值类型、字母、中文,求数值类型、字母类型、中文类型各自的次数
/**
 * Problem 1: counts how many comma-separated tokens in the input are
 * numeric ("数字"), Chinese ("中文字符") or alphabetic ("字母类型").
 * Token classification is delegated to the project helper {@code TypeUtil};
 * input is specified to contain only those three kinds of token.
 */
public class Fileone {

    /** Emits (type-label, 1) for every comma-separated token of each input line. */
    static class FileoneMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final Text word = new Text();
        private final IntWritable one = new IntWritable(1);

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokens = new StringTokenizer(value.toString(), ",");
            while (tokens.hasMoreTokens()) {
                String item = tokens.nextToken();
                if (TypeUtil.isNumeric(item)) {
                    word.set("数字");
                } else if (TypeUtil.isChineseStr(item)) {
                    word.set("中文字符");
                } else {
                    // Input is specified to hold only digits/letters/Chinese,
                    // so everything else is counted as letters.
                    word.set("字母类型");
                }
                context.write(word, one);
            }
        }
    }

    /** Sums the per-type counts; also reused as the combiner. */
    static class FileoneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable total = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: Fileone <input dir> <output dir>");
            System.exit(2);
        }
        String output = args[1];
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "zrf");
        job.setJarByClass(Fileone.class);
        job.setMapperClass(FileoneMapper.class);
        // Summation is associative and commutative, so the reducer doubles as
        // a combiner to cut shuffle traffic.
        job.setCombinerClass(FileoneReducer.class);
        job.setReducerClass(FileoneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // FilePathUtil.readFile presumably collects the input file paths under
        // args[0] — project helper, not visible here; TODO confirm its contract.
        List<Path> inputs = FilePathUtil.readFile(args[0]);
        for (Path path : inputs) {
            FileInputFormat.addInputPath(job, path);
        }
        FileOutputFormat.setOutputPath(job, new Path(output));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
用到的判断类型的工具类
package com.zrf.MapDriver;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * String-classification helpers for the word-type counting job.
 * Patterns are compiled once and reused ({@link Pattern} is thread-safe).
 */
public class TypeUtil {
    // "+" (not "*") so the empty string is never reported as a match —
    // the old "[0-9]*" / "^[a-zA-Z]*" both matched "".
    private static final Pattern NUMERIC = Pattern.compile("[0-9]+");
    private static final Pattern ENGLISH = Pattern.compile("[a-zA-Z]+");
    private static final Pattern CHINESE = Pattern.compile("[\u4e00-\u9fa5]+");

    /** Returns true when {@code str} is one or more ASCII digits. */
    public static boolean isNumeric(String str) {
        return NUMERIC.matcher(str).matches();
    }

    /** Returns true when the whole string is one or more ASCII letters. */
    public static boolean isEnglishStr(String charaString) {
        return ENGLISH.matcher(charaString).matches();
    }

    /**
     * Returns true when the whole string consists of CJK characters.
     * Bug fix: the old version used find(), so mixed strings such as
     * "abc中" were wrongly classified as Chinese.
     */
    public static boolean isChineseStr(String str) {
        return CHINESE.matcher(str).matches();
    }
}
工具方法 readFile2:读取单个文本文件的内容,逐行读取并用","拼接成一个字符串返回(读取文件夹及其子文件夹内文件路径的逻辑在 FilePathUtil.readFile 中,未在此处展示)
/**
 * Reads a UTF-8 text file and joins its lines with "," into one string.
 *
 * @param path local filesystem path of the file to read
 * @return all lines joined by ","; the empty string for an empty file
 * @throws Exception if the file cannot be opened or read
 */
public static String readFile2(String path) throws Exception {
    // try-with-resources: the old version never closed either stream,
    // leaking a file handle on every call (and on every exception).
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
        StringBuilder result = new StringBuilder();
        String line;
        boolean first = true;
        while ((line = br.readLine()) != null) {
            if (first) {
                first = false;
            } else {
                result.append(",");
            }
            result.append(line);
        }
        return result.toString();
    }
}
2.在hdfs目录/tmp/tl/input/wordcount目录中有一系列文件,内容为","号分隔,同时在hdfs路径/tmp/tl/black.txt黑名单文件,一行一个单词用于存放不记入统计的单词列表。求按","号分隔的各个元素去除掉黑名单后的出现频率,输出到目录/tmp/tl/output/个人用户名的hdfs目录中。
/**
 * Problem 2: word frequency over comma-separated input, excluding every word
 * listed in a blacklist file whose name starts with "black" (one word per
 * line). Blacklisted words are marked in the map phase with a sentinel count
 * of 0; the reducer drops any key that saw the sentinel.
 *
 * Structural fix: in the original, a misplaced closing brace nested
 * FileTwoReducer and main() INSIDE FileTwoMapper; they are now direct
 * members of FileTwo.
 */
public class FileTwo {

    /** Emits (word, 1) for data-file tokens and (word, 0) for blacklist lines. */
    static class FileTwoMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        // Sentinel meaning "this word is blacklisted". Using two constants
        // fixes the original bug where one reused IntWritable, once set to 0,
        // stayed 0 for every later record of the task.
        private static final IntWritable ZERO = new IntWritable(0);
        private final Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String filename = ((FileSplit) context.getInputSplit()).getPath().getName();
            if (filename.startsWith("black")) {
                // Blacklist file: one word per line.
                word.set(value.toString());
                context.write(word, ZERO);
            } else {
                StringTokenizer tokens = new StringTokenizer(value.toString(), ",");
                while (tokens.hasMoreTokens()) {
                    word.set(tokens.nextToken());
                    context.write(word, ONE);
                }
            }
        }
    }

    /** Sums counts per word; suppresses any word flagged by the 0 sentinel. */
    static class FileTwoReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable total = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                if (val.get() == 0) {
                    // Blacklisted word: emit nothing for this key.
                    return;
                }
                sum += val.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        Configuration configuration = new Configuration();
        GenericOptionsParser optionParser = new GenericOptionsParser(configuration, args);
        String[] remainingArgs = optionParser.getRemainingArgs();
        if (remainingArgs.length < 2) {
            System.err.println("Usage: FileTwo <input dir (incl. blacklist)> <output dir>");
            System.exit(2);
        }
        Job job = Job.getInstance(configuration, "zrf");
        job.setJarByClass(FileTwo.class);
        job.setMapperClass(FileTwoMapper.class);
        // Deliberately no combiner: the 0 sentinel must reach the reducer intact.
        job.setReducerClass(FileTwoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // FilePathUtil.readFile presumably collects the input file paths under
        // remainingArgs[0] — project helper, not visible here; TODO confirm.
        List<Path> inputs = FilePathUtil.readFile(remainingArgs[0]);
        for (Path path : inputs) {
            FileInputFormat.addInputPath(job, path);
        }
        FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
3.在hdfs目录/tmp/table/student中存在student.txt文件,按tab分隔,字段名为(学号,姓名,课程号,班级名称),hdfs目录/tmp/table/student_location中存在student_location.txt文件,按tab分隔,字段名为(学号,省份,城市,区名),在Map任务中用student_location.txt文件中的学号过滤student.txt中的学号字段,输出student.txt中的存在交集的记录,输出结果结构按tab分隔后的四个字段为(学号,姓名,课程号,班级名称,省份,城市)。
/**
 * Problem 3: reduce-side join of student.txt (学号,姓名,课程号,班级名称) with
 * student_location.txt (学号,省份,城市,区名), both tab-separated. For every
 * 学号 present in both files, outputs: 学号 TAB 姓名 TAB 课程号 TAB 班级名称
 * TAB 省份 TAB 城市.
 */
public class Test {

    /** Tags each record with its source table, keyed by 学号 (field 0). */
    private static class TokenCountermapper33 extends Mapper<Object, Text, Text, Text> {
        private final Text word = new Text();
        private final Text word1 = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
            // Spec says the files are tab-separated; the old code split on ",".
            String[] str = value.toString().split("\t");
            // Bug fix: "student_location.txt" ALSO starts with "student", so
            // the location file must be recognised first.
            if (fileName.startsWith("student_location")) {
                word.set(str[0]);
                word1.set("locat\t" + str[1] + "\t" + str[2] + "\t" + str[3]);
            } else {
                word.set(str[0]);
                word1.set("student\t" + str[1] + "\t" + str[2] + "\t" + str[3]);
            }
            context.write(word, word1);
        }
    }

    /** Joins the tagged records; emits only 学号 that appear in BOTH files. */
    private static class IntSumReducer33 extends Reducer<Text, Text, Text, Text> {
        private final Text text1 = new Text();

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String name = null, course = null, clazz = null;
            String province = null, city = null;
            for (Text val : values) {
                String[] a = val.toString().split("\t");
                if ("student".equals(a[0])) {
                    name = a[1];
                    course = a[2];
                    clazz = a[3]; // bug fix: old code stored a[2] for 班级名称 too
                } else {
                    province = a[1];
                    city = a[2];
                    // a[3] (区名) is not part of the required output
                }
            }
            // Require one record from EACH side; the old "count > 1" test also
            // fired for duplicate records within a single file.
            if (name != null && province != null) {
                text1.set(name + "\t" + course + "\t" + clazz + "\t" + province + "\t" + city);
                context.write(key, text1);
            }
        }
    }

    /** Routes even 学号 to reducer 0 and odd 学号 to reducer 1. */
    private static class MyPartitions extends Partitioner<Text, Text> {
        @Override
        public int getPartition(Text key, Text value, int reducesNum) {
            if (reducesNum < 2) {
                return 0; // single reducer: everything goes to partition 0
            }
            return Integer.parseInt(key.toString()) % 2 == 0 ? 0 : 1;
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Work");
        job.setJarByClass(Test.class);
        job.setMapperClass(TokenCountermapper33.class);
        job.setReducerClass(IntSumReducer33.class);
        job.setPartitionerClass(MyPartitions.class);
        // Replaces the deprecated "mapred.reduce.tasks" property.
        job.setNumReduceTasks(2);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileInputFormat.addInputPath(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
4.(已排序好文本文件的分组-流式分组)给定一个本地文本文件finance_record_sorted.txt,共2个字段(工号,报销费用),其中按工号升序排列,并用tab分隔。求对该数据进行按工号字段的分组,
/**
 * Problem 4: streaming group-by over a text file already sorted by its first
 * column. Each line is "&lt;key&gt;&lt;whitespace&gt;&lt;value&gt;"; consecutive lines with the
 * same key are collected and printed as "&lt;key&gt; [v1, v2, ...]".
 *
 * Generalized: the input path may be passed as args[0]; the original
 * hard-coded path remains the fallback for backward compatibility.
 */
public class Test2 {
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0]
                : "E:\\ideaxiangmu\\MapReduce\\data\\text1.txt";
        // try-with-resources + explicit UTF-8: the old version leaked both
        // streams on an I/O error and used the platform default charset.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;
            String currentKey = null;
            List<String> group = new ArrayList<>();
            while ((line = br.readLine()) != null) {
                String[] fields = line.split("\\s+");
                if (currentKey == null || currentKey.equals(fields[0])) {
                    // First line, or still inside the current group.
                    currentKey = fields[0];
                    group.add(fields[1]);
                } else {
                    // Key changed: flush the finished group, start the next.
                    System.out.println(currentKey + " " + group);
                    group.clear();
                    currentKey = fields[0];
                    group.add(fields[1]);
                }
            }
            // Flush the last group. Guard fixes the old behaviour of printing
            // "null []" for an empty input file.
            if (currentKey != null) {
                System.out.println(currentKey + " " + group);
            }
        }
    }
}