上一篇博客主要写了如何用Hive操作日志,这一篇主要通过MR程序来进行处理
由于在win中操作,更改目录麻烦,所以本次MR操作在虚拟机的eclipse中
环境准备
在虚拟机中就不使用Maven工程,需要将本地的包都导入到项目中
找到你下载的hadoop,将share中的以下包内容导入到你的项目中
进入所选的目录,将lib中的所有包,以及框中的包导入(四个目录同理)
导入后,可以开始编写MR程序了
编写程序
以下IP是本地Hadoop环境的ip,如需使用以下代码,自行更改ip和文件路径。运行过程中,请确保HDFS中输出文件不存在。第二次运行程序,需要删除第一次运行的结果。
(1)查询总条数
package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @Title:
* @author: 陈宏松
* @create: 2018-11-27 21:46
* @version: 1.0.0
**/
public class MRCountAll {
    // Kept for backward compatibility with existing callers; now populated from a
    // Hadoop counter after the job finishes. A plain static only works when map
    // tasks happen to run in the driver JVM (local mode); on a cluster it stays 0.
    public static Integer i = 0;
    public static boolean flag = true;

    // Counter group/name used to ship the record count from map tasks to the driver.
    private static final String COUNTER_GROUP = "MRCountAll";
    private static final String COUNTER_RECORDS = "records";

    /** Map-only task: increments one counter per input record, emits nothing. */
    public static class CountAllMap extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Counters are aggregated by the framework across all task JVMs,
            // unlike the original static i++ which was lost on a real cluster.
            context.getCounter(COUNTER_GROUP, COUNTER_RECORDS).increment(1);
        }
    }

    /**
     * Runs a map-only job that counts the records of the input file.
     *
     * @param Inputpath HDFS path of the input file
     * @param Outpath   HDFS output directory (must not exist yet)
     */
    public static void runcount(String Inputpath, String Outpath) {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        try {
            // Job setup lives inside the try: the original caught the
            // getInstance failure and then dereferenced a null job (NPE).
            Job job = Job.getInstance(conf, "count");
            job.setJarByClass(MRCountAll.class);
            job.setMapperClass(CountAllMap.class);
            job.setNumReduceTasks(0); // map-only: no reducer is configured
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job, new Path(Inputpath));
            FileOutputFormat.setOutputPath(job, new Path(Outpath));
            if (job.waitForCompletion(true)) {
                // Pull the aggregated count back into the legacy static field.
                i = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_RECORDS).getValue();
            }
        } catch (IOException | ClassNotFoundException | IllegalArgumentException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the interrupt flag
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        runcount("/sogou.500w.utf8", "/sogou/data/CountAll");
        System.out.println("总条数: " + i);
    }
}
(2)非空查询条数
package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @Title:
* @author: 陈宏松
* @create: 2018-11-27 21:49
* @version: 1.0.0
**/
public class CountNotNull {
    public static String Str = "";
    // Populated from a Hadoop counter after the job completes; a plain static
    // only works when map tasks run in the driver JVM (local mode).
    public static int i = 0;
    public static boolean flag = true;

    private static final String COUNTER_GROUP = "CountNotNull";
    private static final String COUNTER_NOT_NULL = "notNull";

    /** Emits (uid, 1) for every record whose query field (column 3) is non-empty. */
    public static class wyMap extends Mapper<Object, Text, Text, IntWritable> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String[] values = value.toString().split("\t");
            // Original check was broken: values[2].equals(null) is always false
            // (or throws NPE), and values[2] != "" compares references, not content.
            if (values.length > 2 && values[2] != null && !values[2].isEmpty()) {
                context.write(new Text(values[1]), new IntWritable(1));
                context.getCounter(COUNTER_GROUP, COUNTER_NOT_NULL).increment(1);
            }
        }
    }

    /**
     * Runs a map-only job counting records with a non-empty query field.
     *
     * @param inputPath  HDFS input file
     * @param outputPath HDFS output directory (must not exist yet)
     */
    public static void run(String inputPath, String outputPath) {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        try {
            // Setup inside the try avoids the original's NPE when getInstance failed.
            Job job = Job.getInstance(conf, "countnotnull");
            job.setJarByClass(CountNotNull.class);
            job.setMapperClass(wyMap.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path(inputPath));
            FileOutputFormat.setOutputPath(job, new Path(outputPath));
            if (job.waitForCompletion(true)) {
                i = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_NOT_NULL).getValue();
            }
        } catch (IOException | ClassNotFoundException | IllegalArgumentException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the interrupt flag
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        run("hdfs://192.168.60.1:9000/sogou.500w.utf8", "hdfs://192.168.60.1:9000/sogou/data/CountNotNull");
        // The counter-backed value is now correct on a cluster, so print it.
        System.out.println("非空条数: " + i);
    }
}
(3)无重复总条数
package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @Title:
* @author: 陈宏松
* @create: 2018-11-28 11:26
* @version: 1.0.0
**/
public class CountNotRepeat {
    // Populated from a Hadoop counter after the job completes; the original
    // static increment happened in the reduce-task JVM and never reached main.
    public static int i = 0;

    private static final String COUNTER_GROUP = "CountNotRepeat";
    private static final String COUNTER_DISTINCT = "distinct";

    /** Keys each record by time+uid+query+url so duplicates collapse in the shuffle. */
    public static class NotRepeatMap extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            String[] values = value.toString().split("\t");
            String time = values[0];
            String uid = values[1];
            String name = values[2]; // query keyword
            String url = values[5];
            context.write(new Text(time + uid + name + url), new Text("1"));
        }
    }

    /**
     * One call per distinct record. The mapper emits Text values, so the input
     * value type here must be Text — the original declared IntWritable and
     * failed at runtime with a ClassCastException.
     */
    public static class NotRepeatReduc extends Reducer<Text, Text, Text, IntWritable> {
        private int index = 0; // running index written next to each distinct key

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            index++;
            context.getCounter(COUNTER_GROUP, COUNTER_DISTINCT).increment(1);
            context.write(key, new IntWritable(index));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        Job job = Job.getInstance(conf, "countnotrepeat");
        job.setJarByClass(CountNotRepeat.class);
        job.setMapperClass(NotRepeatMap.class);
        job.setReducerClass(NotRepeatReduc.class);
        // Map output (Text,Text) differs from the final output (Text,IntWritable),
        // so both pairs must be declared explicitly.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountNotRepeat"));
        if (job.waitForCompletion(true)) {
            i = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_DISTINCT).getValue();
        }
        System.out.println("无重复总条数为: " + i);
    }
}
(4)独立UID总数
package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @Title:
* @author: 陈宏松
* @create: 2018-11-28 11:37
* @version: 1.0.0
**/
public class CountNotMoreUid {
    // Populated from a Hadoop counter after the job completes; the original
    // static increment happened in the reduce-task JVM and never reached main.
    public static int i = 0;

    private static final String COUNTER_GROUP = "CountNotMoreUid";
    private static final String COUNTER_UIDS = "distinctUids";

    /** Emits (uid, "1") so identical uids collapse to one reduce call. */
    public static class UidMap extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            String[] values = value.toString().split("\t");
            String uid = values[1];
            context.write(new Text(uid), new Text("1"));
        }
    }

    /**
     * One call per distinct uid. Input value type must be Text to match the
     * mapper — the original declared IntWritable and failed at runtime with a
     * ClassCastException.
     */
    public static class UidReduc extends Reducer<Text, Text, Text, IntWritable> {
        private int index = 0; // running index written next to each uid

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            index++;
            context.getCounter(COUNTER_GROUP, COUNTER_UIDS).increment(1);
            context.write(key, new IntWritable(index));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        Job job = Job.getInstance(conf, "countnotmoreuid");
        // Was CountNotNull.class — a copy-paste slip that ships the wrong jar.
        job.setJarByClass(CountNotMoreUid.class);
        job.setMapperClass(UidMap.class);
        job.setReducerClass(UidReduc.class);
        // Map output (Text,Text) differs from the final output (Text,IntWritable).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountNotMoreUid"));
        if (job.waitForCompletion(true)) {
            i = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_UIDS).getValue();
        }
        System.out.println("独立UID条数: " + i);
    }
}
以下程序皆可运行,不再贴图
(5)查询频度排名(频度最高的前50词)
package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.TreeMap;
/**
* @Title:
* @author: 陈宏松
* @create: 2018-11-28 11:41
* @version: 1.0.0
**/
public class CountTop50 {
    /** Emits (keyword, 1) for every record; the keyword is column 3. */
    public static class TopMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private final Text text = new Text();
        private static final LongWritable ONE = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] line = value.toString().split("\t");
            text.set(line[2]);
            context.write(text, ONE);
        }
    }

    /**
     * Keeps the 50 most frequent keywords. The original used
     * TreeMap&lt;Integer,String&gt; keyed by frequency alone, so every keyword
     * tying with another on count silently overwrote the previous one; the
     * composite key below keeps ties distinct while preserving count order.
     */
    public static class TopReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        // Key = zero-padded count + '\u0001' + keyword. Zero padding makes
        // lexicographic order equal numeric order, so firstKey() is the
        // current minimum and eviction stays O(log n).
        private final TreeMap<String, Long> top = new TreeMap<String, Long>();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> value, Context context)
                throws IOException, InterruptedException {
            long sum = 0; // total occurrences of this keyword
            for (LongWritable ltext : value) {
                sum += ltext.get();
            }
            top.put(String.format("%020d\u0001%s", sum, key.toString()), sum);
            // Keep only the top 50 entries: evict the current smallest count.
            if (top.size() > 50) {
                top.remove(top.firstKey());
            }
        }

        @Override
        protected void cleanup(Context context)
                throws IOException, InterruptedException {
            // Emit the surviving keywords in ascending count order.
            for (String composite : top.keySet()) {
                int sep = composite.indexOf('\u0001');
                context.write(new Text(composite.substring(sep + 1)),
                        new LongWritable(top.get(composite)));
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        Job job = Job.getInstance(conf, "count");
        job.setJarByClass(CountTop50.class);
        job.setJobName("Five");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setMapperClass(TopMapper.class);
        job.setReducerClass(TopReducer.class);
        // The top-50 state lives inside one reducer instance, so a global
        // result requires exactly one reduce task.
        job.setNumReduceTasks(1);
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountTop50"));
        job.waitForCompletion(true);
    }
}
(6)查询次数大于2次的用户总数
package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @Title:
* @author: 陈宏松
* @create: 2018-11-28 11:53
* @version: 1.0.0
**/
public class CountQueriesGreater2 {
    // Populated from a Hadoop counter after the job completes; the original
    // static increment happened inside the reduce-task JVM and never reached main.
    public static int total = 0;

    private static final String COUNTER_GROUP = "CountQueriesGreater2";
    private static final String COUNTER_USERS = "usersGreater2";

    /** Emits (uid, 1) per record; uid is column 2. */
    public static class MyMaper extends Mapper<Object, Text, Text, IntWritable> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String[] str = value.toString().split("\t");
            context.write(new Text(str[1]), new IntWritable(1));
        }
    }

    /** Counts uids whose query count exceeds 2. Emits no output records. */
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text arg0, Iterable<IntWritable> arg1,
                Reducer<Text, IntWritable, Text, IntWritable>.Context arg2) throws IOException, InterruptedException {
            // arg0 is a uid, arg1 its per-record ones.
            int sum = 0;
            for (IntWritable i : arg1) {
                sum += i.get();
            }
            if (sum > 2) {
                arg2.getCounter(COUNTER_GROUP, COUNTER_USERS).increment(1);
            }
            //arg2.write(arg0, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        Job job = Job.getInstance(conf, "six");
        job.setMapperClass(MyMaper.class);
        // NOTE: do NOT add this reducer as a combiner — the >2 threshold must
        // see the full per-uid sum, and the reducer emits nothing anyway.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setJarByClass(CountQueriesGreater2.class);
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountQueriesGreater2"));
        if (job.waitForCompletion(true)) {
            total = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_USERS).getValue();
        }
        System.out.println("查询次数大于2次的用户总数:" + total + "条");
    }
}
(7)查询次数大于2次的用户占比
package MR;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* @Title:
* @author: 陈宏松
* @create: 2018-11-28 11:49
* @version: 1.0.0
**/
public class CountQueriesGreaterPro {
    /*
     * Ratio of users with more than 2 queries to the total record count.
     * Both numbers are accumulated via Hadoop counters: the original static
     * fields were incremented inside the map/reduce task JVMs and were never
     * visible to the driver on a real cluster (always 0 / NaN).
     */
    // Populated from counters after the job completes (kept for compatibility).
    public static int total1 = 0; // users with >2 queries
    public static int total2 = 0; // total input records

    private static final String COUNTER_GROUP = "CountQueriesGreaterPro";
    private static final String COUNTER_USERS = "usersGreater2";
    private static final String COUNTER_RECORDS = "records";

    /** Emits (uid, 1) per record and counts every record seen. */
    public static class MyMaper extends Mapper<Object, Text, Text, IntWritable> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            context.getCounter(COUNTER_GROUP, COUNTER_RECORDS).increment(1);
            String[] str = value.toString().split("\t");
            context.write(new Text(str[1]), new IntWritable(1));
        }
    }

    /** Sums per-uid query counts, writes (uid, count), counts uids with count > 2. */
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text arg0, Iterable<IntWritable> arg1,
                Reducer<Text, IntWritable, Text, IntWritable>.Context arg2) throws IOException, InterruptedException {
            // arg0 is a uid, arg1 its per-record ones.
            int sum = 0;
            for (IntWritable i : arg1) {
                sum += i.get();
            }
            if (sum > 2) {
                arg2.getCounter(COUNTER_GROUP, COUNTER_USERS).increment(1);
            }
            arg2.write(arg0, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        System.out.println("seven begin");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        Job job = Job.getInstance(conf, "seven");
        job.setMapperClass(MyMaper.class);
        // NOTE: do NOT add this reducer as a combiner — the >2 threshold must
        // see the full per-uid sum, not partial map-side sums.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setJarByClass(CountQueriesGreaterPro.class);
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountQueriesGreaterPro"));
        if (job.waitForCompletion(true)) {
            total1 = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_USERS).getValue();
            total2 = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_RECORDS).getValue();
        }
        System.out.println("total1=" + total1 + "\ttotal2=" + total2);
        // Guard against an empty input (total2 == 0 would yield NaN).
        float percentage = total2 == 0 ? 0f : (float) total1 / (float) total2;
        System.out.println("查询次数大于2次的用户占比为:" + percentage * 100 + "%");
        System.out.println("over");
    }
}
(8)Rank在10以内的点击次数占比
package MR;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* @Title:
* @author: 陈宏松
* @create: 2018-11-28 11:52
* @version: 1.0.0
**/
public class CountRank {
    // Populated from counters after the job completes; the original statics
    // were incremented in map-task JVMs and never reached the driver on a cluster.
    public static int sum1 = 0; // clicks with rank <= 10
    public static int sum2 = 0; // total records

    private static final String COUNTER_GROUP = "CountRank";
    private static final String COUNTER_TOP10 = "rankTop10";
    private static final String COUNTER_RECORDS = "records";

    /** Map-only task: counts records and those whose rank (column 4) is 1..10. */
    public static class MyMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            context.getCounter(COUNTER_GROUP, COUNTER_RECORDS).increment(1);
            String[] str = value.toString().split("\t");
            // Skip short/malformed rows instead of crashing the whole job.
            if (str.length > 3) {
                int rank = Integer.parseInt(str[3]);
                if (rank < 11) {
                    context.getCounter(COUNTER_GROUP, COUNTER_TOP10).increment(1);
                }
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        Job job = Job.getInstance(conf, "eight");
        job.setMapperClass(MyMapper.class);
        job.setNumReduceTasks(0); // map-only job, nothing to reduce
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setJarByClass(CountRank.class);
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountRank"));
        if (job.waitForCompletion(true)) {
            sum1 = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_TOP10).getValue();
            sum2 = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_RECORDS).getValue();
        }
        System.out.println("sum1=" + sum1 + "\tsum2=" + sum2);
        // Guard against an empty input (sum2 == 0 would yield NaN).
        float percentage = sum2 == 0 ? 0f : (float) sum1 / (float) sum2;
        System.out.println("Rank在10以内的点击次数占比:" + percentage * 100 + "%");
    }
}
(9)直接输入URL查询的比例
package MR;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* @Title:
* @author: 陈宏松
* @create: 2018-11-28 11:55
* @version: 1.0.0
**/
public class CountURL {
    // Populated from counters after the job completes; the original statics
    // were incremented in map-task JVMs and never reached the driver on a cluster.
    public static int sum1 = 0; // queries containing "www"
    public static int sum2 = 0; // total records

    private static final String COUNTER_GROUP = "CountURL";
    private static final String COUNTER_WWW = "wwwQueries";
    private static final String COUNTER_RECORDS = "records";

    // Compile once and reuse — Pattern is immutable and thread-safe; the
    // original recompiled it for every input record.
    private static final Pattern WWW = Pattern.compile("www");

    /** Map-only task: counts records and those whose query (column 3) contains "www". */
    public static class MyMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            context.getCounter(COUNTER_GROUP, COUNTER_RECORDS).increment(1);
            String[] str = value.toString().split("\t");
            // The original ignored find()'s return value and relied on
            // matcher.group() throwing IllegalStateException for the
            // "no match" path; test the boolean result instead.
            if (str.length > 2 && WWW.matcher(str[2]).find()) {
                context.getCounter(COUNTER_GROUP, COUNTER_WWW).increment(1);
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        Job job = Job.getInstance(conf, "nine");
        job.setMapperClass(MyMapper.class);
        job.setNumReduceTasks(0); // map-only job, nothing to reduce
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setJarByClass(CountURL.class);
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountURL"));
        if (job.waitForCompletion(true)) {
            sum1 = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WWW).getValue();
            sum2 = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_RECORDS).getValue();
        }
        System.out.println("sum1=" + sum1 + "\tsum2=" + sum2);
        // Guard against an empty input (sum2 == 0 would yield NaN).
        float percentage = sum2 == 0 ? 0f : (float) sum1 / (float) sum2;
        System.out.println("直接用url'%www%'查询的用户占比:" + percentage * 100 + "%");
    }
}
(10)查询搜索过“仙剑奇侠传”的uid,并且次数大于3
package MR;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* @Title:
* @author: 陈宏松
* @create: 2018-11-28 11:58
* @version: 1.0.0
**/
public class CountUidGreater3 {
    // Str is accumulated in the reduce-task JVM and is only meaningful in
    // local mode; the HDFS output file is the reliable result on a cluster.
    public static String Str = "";
    // Populated from a Hadoop counter after the job completes.
    public static int i = 0;

    private static final String COUNTER_GROUP = "CountUidGreater3";
    private static final String COUNTER_MATCHED = "uidGreater3";

    /** Emits (uid, 1) for every record whose query is exactly the target keyword. */
    public static class Map extends Mapper<Object, Text, Text, IntWritable> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String[] values = value.toString().split("\t");
            String pattern = "仙剑奇侠传"; // exact query text to match
            if (values.length > 2 && values[2].equals(pattern)) {
                context.write(new Text(values[1]), new IntWritable(1));
            }
        }
    }

    /** Keeps uids that searched the keyword more than 3 times. */
    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> value,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : value) {
                sum = sum + v.get();
            }
            if (sum > 3) {
                // The original never wrote anything here, so the job's output
                // file was always empty; persist the qualifying uid and count.
                context.write(key, new IntWritable(sum));
                context.getCounter(COUNTER_GROUP, COUNTER_MATCHED).increment(1);
                Str = Str + key.toString() + "\n"; // best-effort, local mode only
            }
        }
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        try {
            // Setup inside the try avoids the original's NPE when getInstance failed.
            Job job = Job.getInstance(conf, "count");
            job.setJarByClass(CountUidGreater3.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
            FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountUidGreater3"));
            if (job.waitForCompletion(true)) {
                i = (int) job.getCounters().findCounter(COUNTER_GROUP, COUNTER_MATCHED).getValue();
            }
        } catch (IOException | ClassNotFoundException | IllegalArgumentException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the interrupt flag
            e.printStackTrace();
        }
        System.out.println("i: " + i);
        System.out.println(Str);
    }
}
以上结果均可以在Hadoop的web界面中查询