所有的实例都在本地进行,启动使用上篇文章的第三种方式
1.电影评分的平均值(所用文件rating.json)
原始数据:{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
结果显示:1000 3
1002 4
1003 2
1004 2
1005 2
思路:利用JSON转换工具将数据封装为对象,方便去使用,在Map阶段,将movie和rate作为key、value值,在Reduce阶段将movie和平均评分作为key、value值
(1)封装代码块
/**
 * Bean for one movie rating record parsed from rating.json, e.g.
 * {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}.
 * Jackson populates the fields by matching JSON keys to property names.
 */
public class MovieBean {

    private String movie;     // movie id
    private int rate;         // rating value; Jackson coerces the quoted "5" to an int
    private String timeStamp; // epoch-seconds timestamp, kept as a string
    private String uid;       // user id

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public int getRate() {
        return rate;
    }

    public void setRate(int rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return "MovieBean [movie=" + movie + ", rate=" + rate + ", timeStamp=" + timeStamp + ", uid=" + uid + "]";
    }
} // NOTE: this closing brace was missing from the original listing
(2)具体的map、reduce实现
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.codehaus.jackson.map.ObjectMapper;
/**
 * MapReduce job: average rating per movie.
 * Input : one JSON rating object per line (rating.json).
 * Output: movieId TAB integer average rating, as in the sample output.
 */
public class Avg {

    /** Map phase: parse one JSON line into a MovieBean and emit (movieId, rate). */
    public static class MapTask extends Mapper<LongWritable, Text, Text, IntWritable> {
        // ObjectMapper is expensive to construct and reusable; the original
        // created a new one for every input line. One per mapper is enough.
        private final ObjectMapper objectMapper = new ObjectMapper();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            MovieBean bean = objectMapper.readValue(value.toString(), MovieBean.class);
            context.write(new Text(bean.getMovie()), new IntWritable(bean.getRate()));
        }
    }

    /** Reduce phase: for each movie, emit the integer average of its ratings. */
    public static class ReduceTask extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int sum = 0;   // running total of ratings for this movie
            int count = 0; // number of ratings seen
            for (IntWritable rating : values) {
                sum += rating.get();
                count++;
            }
            // Integer division is intentional: the expected output shows whole numbers.
            // `key` is already a Text; no need to wrap it in a new Text as the original did.
            context.write(key, new IntWritable(sum / count));
        }
    }

    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            // Wire up mapper, reducer and the jar to submit.
            job.setMapperClass(MapTask.class);
            job.setReducerClass(ReduceTask.class);
            job.setJarByClass(Avg.class);
            // Declare key/value types for both phases.
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Hadoop refuses to run if the output directory already exists.
            File file = new File("d:\\data\\out\\movie");
            if (file.exists()) {
                FileUtils.deleteDirectory(file);
            }
            // Input file and output directory (local-mode paths).
            FileInputFormat.addInputPath(job, new Path("D:\\data\\in\\movie\\rating.json"));
            FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\movie"));
            // Submit and wait.
            boolean completion = job.waitForCompletion(true);
            System.out.println(completion ? "你很优秀!!!" : "滚去调bug!!");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
2.求两个用户之间的共同好友
原始数据:A:B,C,D,F,E,O
B:A,C,E,K
A和B的共同好友是C和E
结果: 第一个mapreduce:B-C A B-D A
第二个mapreduce:A-B E C A-C D F
思路:1.因为每个用户的共同好友利用mapreduce不好实现,所以反过来求好友的用户
这样就能求出来好友有哪些用户,两两组合起来用户就是用户的共同好友
2.根据第一个结果将结果的Value值两两组合起来就OK
(1)
public class Compile {
public static class MapTask extends Mapper<LongWritable, Text,Text, Text>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String[] user = value.toString().split(":");//将用户和好友分开
String[] friend = user[1].split(","); //将好友分开
for (String string1 : friend) {
context.write(new Text(string1),new Text(user[0]) );
//key是好友,value是用户
}
}
}
public static class ReduceTask extends Reducer<Text, Text, Text, Text>{
@Override
protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
//利用集合将两个用户进行组合,求出共同好友
List<String> userList = new ArrayList<>();
for (Text string : values) {
userList.add(string.toString());
}
Collections.sort(userList); //需要排序是因为避免A-B/B-Akey值不相同的情况,排完序从前往后遍历就OK
for(int i=0;i<userList.size()-1;i++) {
for(int j=i+1;j<userList.size();j++) {
context.write(new Text(userList.get(i)+"-"+userList.get(j)), key);
}
}
}
}
public static void main(String[] args) {
try {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setMapperClass(MapTask.class);
job.setReducerClass(ReduceTask.class);
job.setJarByClass(Compile.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(new Path("d:\\data\\out\\friend"))) {
fs.delete(new Path("d:\\data\\out\\friend"),true);
}
FileInputFormat.addInputPath(job, new Path("e:\\data\\friend.txt"));
FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\friend"));
boolean completion = job.waitForCompletion(true);
System.out.println(completion?"成功":"失败");
}
catch(Exception e) {
}
}
(2)将每两个用户的好友求出来,资料是friend
class Compile2 {
public static class MapTask extends Mapper<LongWritable, Text,Text, Text>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String[] split = value.toString().split("\t");//以tab分隔,将两个用户的共同好友传给reduce
context.write(new Text(split[0]), new Text(split[1]));
}
public static class ReduceTask extends Reducer<Text, Text, Text, Text>{
@Override
protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String friends="";
for (Text text : values) {
friends += text+" ";
}
context.write(new Text(key),new Text(friends));//将共同好友求出
}
}
}
public static void main(String[] args) {
try {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setMapperClass(MapTask.class);
job.setReducerClass(ReduceTask.class);
job.setJarByClass(Compile2.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(new Path("d:\\data\\out\\friend1"))) {
fs.delete(new Path("d:\\data\\out\\friend1"),true);
}
FileInputFormat.addInputPath(job, new Path("d:\\data\\out\\friend\\part-r-00000"));
FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\friend1"));
boolean completion = job.waitForCompletion(true);
System.out.println(completion?"成功":"失败");
}
catch(Exception e) {
}
}
3.求出每个网站的上行流量、下行流量以及流量总和(自己定义Hadoop的序列化类)资料是DATA
原始数据
15639120688 http://v.baidu.com/movie 3936 12058
13905256439 http://movie.youku.com 10132 538
结果
blog.csdn.net FlowBean [up=239908231, down=238717280, sum=478625511]
image.baidu.com FlowBean [up=118511778, down=117759776, sum=236271554]
思路:将同一个网址作为key,流量作为value,当计算他们的流量总和,因为value涉及三个数据,不好管理,所以封装起来。
(1)
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class FlowBean implements Writable { //实现这个接口去增加hadoop的序列化类型
private long up;
private long down;
private long sum;
public void set(long up, long down) {
this.up = up;
this.down = down;
this.sum = up+down;
}
public long getUp() {
return up;
}
public void setUp(long up) {
this.up = up;
}
public long getDown() {
return down;
}
public void setDown(long down) {
this.down = down;
}
public long getSum() {
return sum;
}
public void setSum(long sum) {
this.sum = sum;
}
@Override
public String toString() {
return “FlowBean [up=” + up + “, down=” + down + “, sum=” + sum + “]”;
}
**@Override //序列化与反序列化
public void readFields(DataInput in) throws IOException {
up = in.readLong();
down = in.readLong();
sum = in.readLong();
}
@Override
public void write(DataOutput out) throws IOException {
// TODO Auto-generated method stub
out.writeLong(up);
out.writeLong(down);
out.writeLong(sum);
}`**
(2)MapReduce的实现
public class FlowMR {
public static class MapTask extends Mapper<LongWritable, Text, Text, FlowBean> {
public String reg(String url) {//为了实现截取中间的网址
Pattern pattern = Pattern.compile("(\\w+\\.)?(\\w+\\.){1}\\w+");//传入正则表达式
Matcher matcher = pattern.matcher(url); //匹配url
while(matcher.find()){//找到匹配的话生成新的url
String newUrl = matcher.group();
return newUrl;
}
return null;
}
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, FlowBean>.Context context)
throws IOException, InterruptedException {
try {
String[] split = value.toString().split("\t")[1].split(" ");
long up = Long.parseLong(split[1]);
long down = Long.parseLong(split[2]);
String url = reg(split[0]);
FlowBean fb = new FlowBean();
fb.set(up, down);
context.write(new Text(url), fb);//将每一行的url和计算好的流量传给reduce
} catch (Exception e) {
}
// TODO: handle exception
}
}
public static class ReduceTask extends Reducer<Text, FlowBean, Text, FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values,
Reducer<Text, FlowBean, Text, FlowBean>.Context context) throws IOException, InterruptedException {
long up = 0;
long down = 0;
FlowBean fb = new FlowBean();
for (FlowBean flowBean : values) {
up += flowBean.getUp();
down += flowBean.getDown();//计算总的上行和下行流量
}
fb.set(up, down);
context.write(key, fb);
}
}
public static void main(String[] args) {
try {
//System.setProperty("HADOOP_USER_NAME", "SIMPLE");
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 设置map和reduce,以及提交的jar
job.setMapperClass(MapTask.class);
job.setReducerClass(ReduceTask.class);
job.setJarByClass(FlowMR.class);
// 设置输入输出类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
jo b.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
// 判断文件是否存在
File file = new File("d:\\data\\out\\http");
if (file.exists()) {
FileUtils.deleteDirectory(file);
}
// 输入和输出目录
FileInputFormat.addInputPath(job, new Path("E:/data/DATA.txt"));
FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\http"));
// 提交任务
boolean completion = job.waitForCompletion(true);
System.out.println(completion ? "你很优秀!!!" : "滚去调bug!!");
} catch (Exception e) {
e.printStackTrace();
// TODO: handle exception
}
}
}
4.实现,每个关键词后面,显示所有包含关键词的文件的集合(获取每个词的文件名称)资料是index里面
初始数据:hello hello java c vb c#
hi xiaoming
hello honghong
结果数据:第一个MapReduce: am-b.txt 1
c#-a.txt 1
第二个MapReduce:c a.txt 1, b.txt 1
c# a.txt 1, b.txt 1
思路:首先将每一个文件中的词统计出来,以及在每个文件中的个数
第二个MapReduce将相同词的文件合并起来
(1)
public class CreateUndexOne {
// hello hello hadoop --------> hello-a.txt 1
public static class MapTask extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
**FileSplit fileSplit = (FileSplit)context.getInputSplit();
String name = fileSplit.getPath().getName();**//可以获取关键词所在文件的名称
String[] split = value.toString().split(" ");
for (String string : split) {
context.write(new Text(string + "-" +name), new IntWritable(1));//将每个文件的名称和其个数传给Reduce
}
}
}
public static class ReduceTask extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable intWritable : values) { //统计每个关键词的个数
count++;
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) {
try {
//System.setProperty("HADOOP_USER_NAME", "SIMPLE");
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 设置map和reduce,以及提交的jar
job.setMapperClass(MapTask.class);
job.setReducerClass(ReduceTask.class);
job.setJarByClass(CreateUndexOne.class);
// 设置输入输出类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 判断文件是否存在
File file = new File("d:\\data\\out\\indexOne");
if (file.exists()) {
FileUtils.deleteDirectory(file);
}
// 输入和输出目录
FileInputFormat.addInputPath(job, new Path("D:\\data\\in\\index"));
FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\indexOne"));
// 提交任务
boolean completion = job.waitForCompletion(true);
System.out.println(completion ? "你很优秀!!!" : "滚去调bug!!");
} catch (Exception e) {
e.printStackTrace();
// TODO: handle exception
}
}
}
(2)
/**
 * Second job of the inverted-index computation.
 * Input: "word-file.txt TAB count" lines produced by the first job.
 * Output: one line per word listing every file (with its count) that
 * contains it, comma-separated.
 */
public class CreateUndexTwo {

    public static class MapTask extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Split "word-file.txt\t1" at the dash: key = word, value = "file.txt\t1".
            String[] parts = value.toString().split("-");
            context.write(new Text(parts[0]), new Text(parts[1]));
        }
    }

    public static class ReduceTask extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Join all file entries with commas; the separator is empty for the
            // first entry so no comma leads or trails the list.
            StringBuilder joined = new StringBuilder();
            String separator = "";
            for (Text entry : values) {
                joined.append(separator).append(entry.toString());
                separator = ",";
            }
            context.write(key, new Text(joined.toString()));
        }
    }

    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            // Mapper, reducer and the jar to submit.
            job.setMapperClass(MapTask.class);
            job.setReducerClass(ReduceTask.class);
            job.setJarByClass(CreateUndexTwo.class);
            // Key/value types for both phases.
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            // Clear any previous run's output directory first.
            File outDir = new File("d:\\data\\out\\indexTwo\\");
            if (outDir.exists()) {
                FileUtils.deleteDirectory(outDir);
            }
            // Read the first job's output, write the final index.
            FileInputFormat.addInputPath(job, new Path("d:\\data\\out\\indexOne"));
            FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\indexTwo\\"));
            boolean ok = job.waitForCompletion(true);
            System.out.println(ok ? "你很优秀!!!" : "滚去调bug!!");
        } catch (Exception e) {
            e.printStackTrace();
            // TODO: handle exception
        }
    }
}
`