This article describes how to process the 67 MB web-Google.txt file with the PageRank algorithm. Some familiarity with PageRank is assumed; since there are already many articles introducing PageRank online, it is not re-explained here. Two good references:
- Java version
- Python version
Requirements
Implement the PageRank algorithm with MapReduce and compute the top-100 PR values for the pages in web-Google.txt.
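For reference, the update that the two jobs below implement is the basic power-iteration form of PageRank, written here without a damping factor to match the code (a standard formulation, not taken from the articles linked above):

$$PR_{t+1}(j) = \sum_{i \rightarrow j} \frac{PR_t(i)}{\mathrm{outdegree}(i)}$$

In other words, each round multiplies the transition matrix by the current PR vector: UnitMultiplication computes the per-link products and UnitSum adds them up per target page.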
Solution
1. Coding
Maven dependencies
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.6</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>2.7.6</version>
</dependency>
UnitMultiplication.java
package com.daniel.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * @Author Daniel
 * @Description First MapReduce job: computes Transition Matrix * PR Matrix
 **/
public class UnitMultiplication {
    public static class TransitionMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            /*
             Input:  1    2,8,9,24
             Output: key = 1, value = 2=1/4 (one record per outgoing link)
             */
            String line = value.toString().trim();
            String[] fromTo = line.split("\t");
            // fromTo should have at least 2 elements, but a page may not link to any other page (dangling node)
            if (fromTo.length < 2 || fromTo[1].trim().equals("")) {
                return;
            }
            String from = fromTo[0];
            String[] to = fromTo[1].split(",");
            for (String cur : to) {
                context.write(new Text(from), new Text(cur + "=" + (double) 1 / to.length));
            }
        }
    }

    public static class PRMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            /*
             Suppose page 1 has PR value 1/6012.
             Input:  pr.txt -> 1    1/6012 (the PR value of page 1 is 1/6012)
             Output: key = 1, value = 1/6012
             */
            String[] pr = value.toString().trim().split("\t");
            context.write(new Text(pr[0]), new Text(pr[1]));
        }
    }

    public static class MultiplicationReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            /*
             First job (this reducer):
               Input:  key = 1 (source page), values = <2=1/4, 7=1/4, 1/6012> (target cells + the source page's PR)
               Output: key = 2 (target page), value = 1/4*1/6012 (partial weight contributed to the target page)
             Second job (map and reduce):
               Input:  key = 2 (target page), values = <1/4*1/6012, 1/9*1/6012, ...> (all partial weights of the target page)
               Output: key = 2, value = sum
             */
            List<String> transitionCells = new ArrayList<String>();
            double prCell = .0;
            // Separate the transition cells from the page's PR value
            for (Text value : values) {
                if (value.toString().contains("=")) {
                    transitionCells.add(value.toString().trim());
                } else {
                    prCell = Double.parseDouble(value.toString().trim());
                }
            }
            // Multiply and emit
            for (String cell : transitionCells) {
                String outputKey = cell.split("=")[0];
                double relation = Double.parseDouble(cell.split("=")[1]);
                String outputValue = String.valueOf(relation * prCell);
                context.write(new Text(outputKey), new Text(outputValue));
            }
        }
    }

    // Not static: Driver calls this on an instance
    public void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(UnitMultiplication.class);
        ChainMapper.addMapper(job, TransitionMapper.class, Object.class, Text.class, Text.class, Text.class, conf);
        ChainMapper.addMapper(job, PRMapper.class, Object.class, Text.class, Text.class, Text.class, conf);
        job.setReducerClass(MultiplicationReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // A different input path (and mapper) for each of the two inputs
        // args[0] = transition.txt
        // args[1] = pr.txt
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, TransitionMapper.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, PRMapper.class);
        // Output directory
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        job.waitForCompletion(true);
    }
}
UnitSum.java
package com.daniel.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.text.DecimalFormat;

/**
 * @Author Daniel
 * @Description Second MapReduce job: sums the partial weights produced by the first job
 **/
public class UnitSum {
    public static class PassMapper extends Mapper<Object, Text, Text, DoubleWritable> {
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            /*
             Reads the output of the first job from HDFS; \t is the default key/value separator.
             The mapper essentially passes the data through.
             Input:  key    subPR (one partial weight)
                     2      1/4*1/6012
             Output: key = 2, value = 1/4*1/6012
             */
            String[] pageSubRank = value.toString().trim().split("\t");
            double subRank = Double.parseDouble(pageSubRank[1]);
            context.write(new Text(pageSubRank[0]), new DoubleWritable(subRank));
        }
    }

    public static class SumReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            /*
             The reducer aggregates the mapper output.
             Input:  key = 2 (target page), values = <1/4*1/6012, 1/9*1/6012, ...> (all partial weights of the target page)
             Output: key = 2, value = sum (the new PR value of page 2)
             */
            double total = 0;
            for (DoubleWritable value : values) {
                total += value.get();
            }
            // Keep 5 decimal places (values below 1e-5 become 0 and are later skipped by FilterFile)
            DecimalFormat df = new DecimalFormat("#.00000");
            total = Double.valueOf(df.format(total));
            context.write(key, new DoubleWritable(total));
        }
    }

    // Not static: Driver calls this on an instance
    public void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(UnitSum.class);
        job.setMapperClass(PassMapper.class);
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // args[0] = subPR (output of the first job, input of this one)
        // args[1] = pr (output of this job, fed back to the second mapper of the first job)
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
Driver.java
package com.daniel.mr;

/**
 * @Author Daniel
 * @Description Entry point.
 * Iterates Transition Matrix * PR Matrix and writes each round's result to HDFS.
 **/
public class Driver {
    public static void main(String[] args) throws Exception {
        UnitMultiplication multiplication = new UnitMultiplication();
        UnitSum sum = new UnitSum();
        // transition.txt
        String transitionMatrix = args[0]; // dir where transition.txt resides
        // pr.txt (also the output prefix of the second job)
        String prMatrix = args[1];
        // subPR (output prefix of the first job)
        String subPageRank = args[2];
        // number of iterations
        int count = Integer.parseInt(args[3]);
        for (int i = 0; i < count; i++) {
            // Pass the three paths to the first job, tagging each with the iteration index
            String[] args1 = {transitionMatrix, prMatrix + i, subPageRank + i};
            multiplication.main(args1);
            // Feed the first job's output (subPR) into the second job; each round writes a new
            // pr<i+1> directory (pagerank1, pagerank2, ...), and the final result is the last
            // iteration's output, pr<count>.
            String[] args2 = {subPageRank + i, prMatrix + (i + 1)};
            sum.main(args2);
        }
    }
}
FilterFile.java
package com.daniel.mr;

import java.io.*;
import java.math.BigDecimal;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

/**
 * @Author Daniel
 * @Description Filters the final output and prints the top-N pages; N = 100 here
 **/
public class FilterFile {
    public static void main(String[] args) throws IOException {
        int topN = 100;
        InputStreamReader inr = new InputStreamReader(new FileInputStream("src/main/java/com/daniel/mr/part-r-00000"));
        BufferedReader bf = new BufferedReader(inr);
        String line;
        // Read line by line, skipping pages whose PR rounded down to 0
        Map<String, String> map = new HashMap<>();
        while ((line = bf.readLine()) != null) {
            String[] split = line.split("\t");
            if (!"0.0".equals(split[1])) {
                BigDecimal db = new BigDecimal(split[1]);
                String ii = db.toPlainString();
                // FilterFile.saveAsFileWriter(split[0] + "\t" + ii + "\r\n", "src/main/java/pagerank/all_result.txt", true); // optionally save all values to a file
                map.put(split[0], ii);
            }
        }
        bf.close();
        // Sort by PR value in descending numeric order (a plain string sort would not give the top values)
        LinkedHashMap<String, String> linkedMap = new LinkedHashMap<>();
        map.entrySet().stream()
                .sorted(Map.Entry.comparingByValue(Comparator.comparingDouble(Double::parseDouble).reversed()))
                .forEachOrdered(entry -> linkedMap.put(entry.getKey(), entry.getValue()));
        int count = 0;
        for (String key : linkedMap.keySet()) {
            count++;
            String values = linkedMap.get(key);
            System.out.println(key + "\t" + values);
            if (count == topN)
                break;
        }
    }

    public static void saveAsFileWriter(String content, String fileName, boolean append) {
        FileWriter fwriter = null;
        try {
            fwriter = new FileWriter(fileName, append);
            fwriter.write(content);
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            try {
                if (fwriter != null) {
                    fwriter.flush();
                    fwriter.close();
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}
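Since only the top 100 entries are needed, sorting the entire map is not strictly necessary. As an alternative, a bounded min-heap keeps only N entries in memory; the sketch below (the class name TopNFilter is hypothetical, not part of the project) reads the same part-r-00000 file:

import java.io.*;
import java.util.*;

// Sketch: keep only the N largest PR entries in a min-heap instead of sorting the whole result set.
public class TopNFilter {
    public static void main(String[] args) throws IOException {
        int topN = 100;
        PriorityQueue<Map.Entry<String, Double>> heap =
                new PriorityQueue<>((a, b) -> Double.compare(a.getValue(), b.getValue()));
        try (BufferedReader br = new BufferedReader(
                new FileReader("src/main/java/com/daniel/mr/part-r-00000"))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] kv = line.split("\t");
                double pr = Double.parseDouble(kv[1]);
                if (pr == 0.0) continue;                 // same filter as FilterFile
                heap.offer(new AbstractMap.SimpleEntry<>(kv[0], pr));
                if (heap.size() > topN) heap.poll();     // evict the current smallest, keep the top N
            }
        }
        // Drain the heap and print from highest to lowest PR
        List<Map.Entry<String, Double>> top = new ArrayList<>(heap);
        top.sort((a, b) -> Double.compare(b.getValue(), a.getValue()));
        for (Map.Entry<String, Double> e : top) {
            System.out.println(e.getKey() + "\t" + e.getValue());
        }
    }
}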
Package the code into a jar and upload it to the Linux server.
2. Prepare the input files
Since web-Google.txt is fairly large, only the first few lines are shown here:
web-Google.txt
0 11342
0 824020
0 867923
0 891835
11342 0
11342 27469
11342 38716
11342 309564
11342 322178
11342 387543
Hive is used here to generate the initial transition.txt and pr.txt:
CREATE TABLE WEB_GOOGLE(key string, value string) ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";
LOAD DATA LOCAL INPATH "/home/hadoop/hive_data/web-Google.txt" INTO TABLE web_google;
SELECT * FROM WEB_GOOGLE LIMIT 10;
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/hive_data/pagerank/transition'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT KEY, CONCAT_WS(',', COLLECT_SET(VALUE)) FROM WEB_GOOGLE GROUP BY KEY;
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/hive_data/pagerank/pr'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT DISTINCT KEY, '0.0000013523' FROM WEB_GOOGLE;
The 0.0000013523 here is the uniform initial PR value, roughly 1/N for the number of pages. It can be chosen according to the actual data; I arrived at it by counting the pages shared between the first and second columns, which came to roughly 570,000.
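If Hive is not available, the same two files can be generated locally with a small Java program. This is only a sketch, assuming the edge list fits comfortably in memory; the class name BuildInput and the file paths are placeholders to adjust to your environment:

import java.io.*;
import java.util.*;

// Sketch: build transition.txt (from \t to1,to2,...) and pr.txt (page \t initialPR)
// from the raw edge list, as an alternative to the Hive queries above.
public class BuildInput {
    public static void main(String[] args) throws IOException {
        String edgeFile = "web-Google.txt";                  // raw "from \t to" edge list
        Map<String, List<String>> adj = new LinkedHashMap<>();
        try (BufferedReader br = new BufferedReader(new FileReader(edgeFile))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.startsWith("#") || line.trim().isEmpty()) continue; // skip SNAP header comments
                String[] parts = line.trim().split("\\s+");
                adj.computeIfAbsent(parts[0], k -> new ArrayList<>()).add(parts[1]);
            }
        }
        // Uniform initial PR over the source pages, analogous to the fixed 0.0000013523 above
        double initialPR = 1.0 / adj.size();
        try (PrintWriter t = new PrintWriter(new FileWriter("transition.txt"));
             PrintWriter p = new PrintWriter(new FileWriter("pr.txt"))) {
            for (Map.Entry<String, List<String>> e : adj.entrySet()) {
                t.println(e.getKey() + "\t" + String.join(",", e.getValue()));
                p.println(e.getKey() + "\t" + initialPR);
            }
        }
    }
}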
Upload the files to HDFS. Note that pr.txt is placed in pagerank0, which serves as the input data for the first iteration:
mv /home/hadoop/hive_data/pagerank/pr/000000_0 /home/hadoop/hive_data/pagerank/pr.txt
mv /home/hadoop/hive_data/pagerank/transition/000000_0 /home/hadoop/hive_data/pagerank/transition.txt
hdfs dfs -mkdir -p /user/hadoop/pagerank/data/transition
hdfs dfs -mkdir -p /user/hadoop/pagerank/data/pagerank0
hdfs dfs -put /home/hadoop/hive_data/pagerank/transition.txt /user/hadoop/pagerank/data/transition
hdfs dfs -put /home/hadoop/hive_data/pagerank/pr.txt /user/hadoop/pagerank/data/pagerank0
3. Run MapReduce
hadoop jar /home/hadoop/jars/pr.jar com.daniel.mr.Driver /user/hadoop/pagerank/data/transition/ /user/hadoop/pagerank/data/pagerank /user/hadoop/pagerank/output 50
The more iterations, the more accurate the result; 50 iterations are used here. The running time depends on your hardware: on my cluster with 5 GB of memory, each iteration took 2-3 minutes, about 2 hours in total.
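As an alternative to fixing the iteration count up front, iteration could stop once the PR values barely change between rounds. A minimal sketch of such a check, assuming two successive pr outputs have been copied to local files (the class name ConvergenceCheck and the file names are hypothetical):

import java.io.*;
import java.util.*;

// Sketch: compare two successive PR outputs and report the largest per-page change.
// Iteration can stop once this difference falls below a chosen threshold.
public class ConvergenceCheck {
    static Map<String, Double> load(String file) throws IOException {
        Map<String, Double> pr = new HashMap<>();
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] kv = line.split("\t");
                pr.put(kv[0], Double.parseDouble(kv[1]));
            }
        }
        return pr;
    }

    public static void main(String[] args) throws IOException {
        Map<String, Double> prev = load("pagerank49.txt");   // hypothetical local copies of two rounds
        Map<String, Double> curr = load("pagerank50.txt");
        double maxDiff = 0;
        for (Map.Entry<String, Double> e : curr.entrySet()) {
            double before = prev.getOrDefault(e.getKey(), 0.0);
            maxDiff = Math.max(maxDiff, Math.abs(e.getValue() - before));
        }
        System.out.println("max per-page change: " + maxDiff);
    }
}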
4. Get the results
Only the output of the last iteration is needed:
hdfs dfs -get /user/hadoop/pagerank/data/pagerank50/part-r-00000
Then place the downloaded part-r-00000 at the path hard-coded in FilterFile.java (src/main/java/com/daniel/mr/) and run FilterFile to filter out the top 100.
Top-100 results (many ties):
409459 0.000010
218710 0.000010
535111 0.000010
676746 0.000010
535114 0.000010
422429 0.000010
423754 0.000010
740160 0.000010
688723 0.000010
537768 0.000010
914321 0.000010
754798 0.000010
44894 0.000010
591285 0.000010
247295 0.000010
765455 0.000010
2314 0.000010
196848 0.000010
549754 0.000010
29278 0.000010
800324 0.000010
640795 0.000010
459730 0.000010
459733 0.000010
791758 0.000010
183503 0.000010
838902 0.000010
55580 0.000010
490572 0.000010
766774 0.000010
446413 0.000010
574736 0.000010
753463 0.000010
261553 0.000010
561437 0.000010
262887 0.000010
30233 0.000010
524457 0.000010
662101 0.000010
881796 0.000010
43533 0.000010
855460 0.000010
687443 0.000010
878103 0.000010
206774 0.000010
93870 0.000010
801686 0.000010
742783 0.000010
824327 0.000010
880410 0.000010
674131 0.000010
491889 0.000010
602486 0.000010
789471 0.000010
398281 0.000010
867457 0.000010
730780 0.000010
298832 0.000010
662133 0.000010
410418 0.000010
687450 0.000010
229404 0.000010
842162 0.000010
913048 0.000010
914377 0.000010
284246 0.000010
867441 0.000010
476259 0.000010
866116 0.000010
477561 0.000010
878128 0.000010
447702 0.000010
147889 0.000010
688747 0.000010
447705 0.000010
91228 0.000010
913054 0.000010
753403 0.000010
634939 0.000010
160850 0.000010
409462 0.000010
837604 0.000010
688763 0.000010
229416 0.000010
488207 0.000010
1049 0.000010
121288 0.000010
453572 0.000010
694575 0.000010
694578 0.000010
139406 0.000010
441587 0.000010
579577 0.000010
851980 0.000010
525708 0.000010
708025 0.000010
746305 0.000010
875951 0.000010
313269 0.000010
610980 0.000010