import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
import java.util.regex.Pattern;
import scala.Tuple2;
import com.google.common.collect.Iterables;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession;
/**
 * Computes PageRank over a link graph using Spark.
 *
 * <p>The input is a text file with one edge per line, as two whitespace-separated tokens:
 * <pre>
 * URL neighbor-URL
 * URL neighbor-URL
 * URL neighbor-URL
 * </pre>
 *
 * <p>Usage: {@code JavaPageRank <file> <number_of_iterations>}
 */
public final class JavaPageRank {
  private static final Pattern SPACES = Pattern.compile("\\s+");

  /**
   * Adds two values. Passed to {@code reduceByKey} to sum the contributions received by each URL.
   */
  private static class Sum implements Function2<Double, Double, Double> {
    @Override
    public Double call(Double a, Double b) {
      return a + b;
    }
  }

  /**
   * Entry point: reads the edge list, runs the requested number of rank updates and prints
   * one line per URL with its final rank.
   *
   * @param args {@code args[0]} is the input path, {@code args[1]} the number of iterations
   * @throws NumberFormatException if {@code args[1]} is not an integer
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
      System.exit(1);
    }

    // Parse once, and before any Spark resource exists: a bad argument fails fast and the
    // loop below does not re-parse the string on every iteration.
    final int iterations = Integer.parseInt(args[1]);

    SparkSession spark = SparkSession
        .builder()
        .appName("JavaPageRank")
        .getOrCreate();

    try {
      JavaRDD<String> lines = spark.read().textFile(args[0]).javaRDD();

      // links: (url, all distinct neighbor urls of that url).
      // Each line is turned into a (url, neighbor) pair, duplicates are dropped, the pairs are
      // grouped by url, and the result is cached because it is joined in every iteration.
      JavaPairRDD<String, Iterable<String>> links = lines
          .filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) {
              // Blank lines carry no edge; skip them instead of failing on parts[1].
              return !s.trim().isEmpty();
            }
          })
          .mapToPair(new PairFunction<String, String, String>() {
            @Override
            public Tuple2<String, String> call(String s) {
              // Trim first: leading whitespace would otherwise produce an empty first token
              // and the edge would be attributed to the URL "".
              String[] parts = SPACES.split(s.trim());
              if (parts.length < 2) {
                throw new IllegalArgumentException(
                    "Malformed input line, expected \"URL neighbor-URL\": " + s);
              }
              return new Tuple2<>(parts[0], parts[1]);
            }
          }).distinct().groupByKey().cache();

      // ranks: (url, rank); every url that has outgoing links starts with rank 1.0.
      JavaPairRDD<String, Double> ranks = links.mapValues(
          new Function<Iterable<String>, Double>() {
            @Override
            public Double call(Iterable<String> rs) {
              return 1.0;
            }
          });

      for (int current = 0; current < iterations; current++) {
        // Calculates URL contributions to the rank of other URLs.
        // links is (url, neighbors) and ranks is (url, rank); after the join each value is
        // (neighbors, rank), and the key is no longer needed.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
            .flatMapToPair(
                new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                  // Emits one (neighbor, rank / neighborCount) pair per neighbor; the
                  // per-url iterators are flattened into a single pair RDD.
                  @Override
                  public Iterator<Tuple2<String, Double>> call(
                      Tuple2<Iterable<String>, Double> s) {
                    int urlCount = Iterables.size(s._1);
                    List<Tuple2<String, Double>> results = new ArrayList<>(urlCount);
                    for (String n : s._1) {
                      results.add(new Tuple2<>(n, s._2() / urlCount));
                    }
                    return results.iterator();
                  }
                });

        // Sum the contributions received by each url, then recompute its rank as
        // 0.15 + 0.85 * sum. The new ranks feed the join of the next iteration.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
          @Override
          public Double call(Double sum) {
            return 0.15 + sum * 0.85;
          }
        });
      }

      // Collect the final (url, rank) pairs to the driver and print them.
      List<Tuple2<String, Double>> output = ranks.collect();
      for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
      }
    } finally {
      // Always release the session, even when the job fails part-way through.
      spark.stop();
    }
  }
}
// Implementing PageRank with Spark.
// (Source page note: latest recommended article published on 2022-01-05 15:02:30.)