转自: https://plmsmile.github.io/2017/03/13/Spark-PairRDD/
PageRank
PageRank的Python版本
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" PageRank算法
author = PuLiming
运行: bin/spark-submit files/pagerank.py data/mllib/pagerank_data.txt 10
"""
from __future__ import print_function
import re
import sys
from operator import add
from pyspark import SparkConf, SparkContext
def compute_contribs(urls, rank):
    """Distribute a page's rank evenly among its neighbouring URLs.

    Args:
        urls: Sized collection of the URLs adjacent to the target URL.
        rank: Current rank of the target URL.

    Yields:
        ``(url, contribution)`` tuples, one per entry in ``urls``, where
        ``contribution`` is ``rank`` divided by the number of URLs.
    """
    num_urls = len(urls)
    if not num_urls:
        # No neighbours: nothing to yield, and this avoids a division by
        # zero in the hoisted computation below.
        return
    # The file imports print_function from __future__ but not division, so
    # an integer rank would be floor-divided on Python 2; float() forces
    # true division on either interpreter.
    share = float(rank) / num_urls
    for url in urls:
        yield (url, share)
def split_url(url_line):
""" 把一行url切分开来
Args:
url_line: 一行ur