/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import org.apache.spark.{SparkConf, SparkContext}


/**

  * Usage: BroadcastTest [slices] [numElem] [broadcastAlgo] [blockSize]

  */

object BroadcastTest {

  def main(args: Array[String]) {


    val bcName = if (args.length > 2) args(2) else "Http"

    val blockSize = if (args.length > 3) args(3) else "4096"


    val sparkConf = new SparkConf().setAppName("Broadcast Test")

      .set("spark.broadcast.factory", s"org.apache.spark.broadcast.${bcName}BroaddcastFactory")

      .set("spark.broadcast.blockSize", blockSize)

    val sc = new SparkContext(sparkConf)


    val slices = if (args.length > 0) args(0).toInt else 2

    val num = if (args.length > 1) args(1).toInt else 1000000


    val arr1 = (0 until num).toArray


    for (i <- 0 until 3) {

      println("Iteration " + i)

      println("===========")

      val startTime = System.nanoTime

      val barr1 = sc.broadcast(arr1)

      val observedSizes = sc.parallelize(1 to 10, slices).map(_ => barr1.value.size)

      // Collect the small RDD so we can print the observed sizes locally.

      observedSizes.collect().foreach(i => println(i))

      println("Iteration %d took %.0f milliseconds".format(i, (System.nanoTime - startTime) / 1E6))

    }


    sc.stop()

  }

}
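
// Example submission (hypothetical jar path; add --master as needed), exercising the torrent
// broadcast factory with 2 slices and 1000000 elements:
//   ./bin/spark-submit --class org.apache.spark.examples.BroadcastTest \
//     spark-examples.jar 2 1000000 Torrent 4096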




/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.nio.ByteBuffer


import scala.collection.JavaConversions._

import scala.collection.mutable.ListBuffer

import scala.collection.immutable.Map


import org.apache.cassandra.hadoop.ConfigHelper

import org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat

import org.apache.cassandra.hadoop.cql3.CqlConfigHelper

import org.apache.cassandra.hadoop.cql3.CqlOutputFormat

import org.apache.cassandra.utils.ByteBufferUtil

import org.apache.hadoop.mapreduce.Job


import org.apache.spark.{SparkConf, SparkContext}

import org.apache.spark.SparkContext._



/*

  Need to create the following keyspace and column family in Cassandra before running this example.

  Start the CQL shell using ./bin/cqlsh and execute the following commands:

  CREATE KEYSPACE retail WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};

  use retail;

  CREATE TABLE salecount (prod_id text, sale_count int, PRIMARY KEY (prod_id));

  CREATE TABLE ordercf (user_id text,

    time timestamp,

    prod_id text,

    quantity int,

    PRIMARY KEY (user_id, time));

  INSERT INTO ordercf (user_id,

    time,

    prod_id,

    quantity) VALUES  ('bob', 1385983646000, 'iphone', 1);

  INSERT INTO ordercf (user_id,

    time,

    prod_id,

    quantity) VALUES ('tom', 1385983647000, 'samsung', 4);

  INSERT INTO ordercf (user_id,

    time,

    prod_id,

    quantity) VALUES ('dora', 1385983648000, 'nokia', 2);

  INSERT INTO ordercf (user_id,

    time,

    prod_id,

    quantity) VALUES ('charlie', 1385983649000, 'iphone', 2);

*/


/**

 * This example demonstrates how to read and write to cassandra column family created using CQL3

 * using Spark.

 * Parameters : <cassandra_node> <cassandra_port>

 * Usage: ./bin/spark-submit --class org.apache.spark.examples.CassandraCQLTest \

 *   examples.jar localhost 9160

 */

object CassandraCQLTest {


  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("CQLTestApp")


    val sc = new SparkContext(sparkConf)

    val cHost: String = args(0)

    val cPort: String = args(1)

    val KeySpace = "retail"

    val InputColumnFamily = "ordercf"

    val OutputColumnFamily = "salecount"


    val job = new Job()

    job.setInputFormatClass(classOf[CqlPagingInputFormat])

    ConfigHelper.setInputInitialAddress(job.getConfiguration(), cHost)

    ConfigHelper.setInputRpcPort(job.getConfiguration(), cPort)

    ConfigHelper.setInputColumnFamily(job.getConfiguration(), KeySpace, InputColumnFamily)

    ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner")

    CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3")


    /** CqlConfigHelper.setInputWhereClauses(job.getConfiguration(), "user_id='bob'") */


    /** An UPDATE writes one or more columns to a record in a Cassandra column family */

    val query = "UPDATE " + KeySpace + "." + OutputColumnFamily + " SET sale_count = ? "

    CqlConfigHelper.setOutputCql(job.getConfiguration(), query)


    job.setOutputFormatClass(classOf[CqlOutputFormat])

    ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KeySpace, OutputColumnFamily)

    ConfigHelper.setOutputInitialAddress(job.getConfiguration(), cHost)

    ConfigHelper.setOutputRpcPort(job.getConfiguration(), cPort)

    ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner")


    val casRdd = sc.newAPIHadoopRDD(job.getConfiguration(),

      classOf[CqlPagingInputFormat],

      classOf[java.util.Map[String,ByteBuffer]],

      classOf[java.util.Map[String,ByteBuffer]])


    println("Count: " + casRdd.count)

    val productSaleRDD = casRdd.map {

      case (key, value) => {

        (ByteBufferUtil.string(value.get("prod_id")), ByteBufferUtil.toInt(value.get("quantity")))

      }

    }

    val aggregatedRDD = productSaleRDD.reduceByKey(_ + _)

    aggregatedRDD.collect().foreach {

      case (productId, saleCount) => println(productId + ":" + saleCount)

    }


    val casoutputCF  = aggregatedRDD.map {

      case (productId, saleCount) => {

        val outColFamKey = Map("prod_id" -> ByteBufferUtil.bytes(productId))

        val outKey: java.util.Map[String, ByteBuffer] = outColFamKey

        var outColFamVal = new ListBuffer[ByteBuffer]

        outColFamVal += ByteBufferUtil.bytes(saleCount)

        val outVal: java.util.List[ByteBuffer] = outColFamVal

        (outKey, outVal)

      }

    }


    casoutputCF.saveAsNewAPIHadoopFile(

        KeySpace,

        classOf[java.util.Map[String, ByteBuffer]],

        classOf[java.util.List[ByteBuffer]],

        classOf[CqlOutputFormat],

        job.getConfiguration()

      )


    sc.stop()

  }

}
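
// After a run, the aggregated sale counts written by this example can be inspected in cqlsh
// (using the retail schema from the comment above):
//   SELECT * FROM retail.salecount;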




/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.nio.ByteBuffer

import java.util.SortedMap


import scala.collection.JavaConversions._


import org.apache.cassandra.db.IColumn

import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat

import org.apache.cassandra.hadoop.ConfigHelper

import org.apache.cassandra.hadoop.ColumnFamilyInputFormat

import org.apache.cassandra.thrift._

import org.apache.cassandra.utils.ByteBufferUtil

import org.apache.hadoop.mapreduce.Job


import org.apache.spark.{SparkConf, SparkContext}

import org.apache.spark.SparkContext._


/*

 * This example demonstrates using Spark with Cassandra with the New Hadoop API and Cassandra

 * support for Hadoop.

 *

 * To run this example, run it with the following command-line params:

 * <cassandra_node> <cassandra_port>

 *

 * So to run it against a local Cassandra node, the params are:

 * localhost 9160

 *

 * The example makes some assumptions:

 * 1. You have already created a keyspace called casDemo and it has a column family named Words

 * 2. The Words column family has a column named "para" which contains test content.

 *

 * You can create the content by running the following script at the bottom of this file with

 * cassandra-cli.

 *

 */

object CassandraTest {


  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("casDemo")

    // Get a SparkContext

    val sc = new SparkContext(sparkConf)


    // Build the job configuration with ConfigHelper provided by Cassandra

    val job = new Job()

    job.setInputFormatClass(classOf[ColumnFamilyInputFormat])


    val host: String = args(0)

    val port: String = args(1)


    ConfigHelper.setInputInitialAddress(job.getConfiguration(), host)

    ConfigHelper.setInputRpcPort(job.getConfiguration(), port)

    ConfigHelper.setOutputInitialAddress(job.getConfiguration(), host)

    ConfigHelper.setOutputRpcPort(job.getConfiguration(), port)

    ConfigHelper.setInputColumnFamily(job.getConfiguration(), "casDemo", "Words")

    ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "casDemo", "WordCount")


    val predicate = new SlicePredicate()

    val sliceRange = new SliceRange()

    sliceRange.setStart(Array.empty[Byte])

    sliceRange.setFinish(Array.empty[Byte])

    predicate.setSlice_range(sliceRange)

    ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate)


    ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner")

    ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner")


    // Make a new Hadoop RDD

    val casRdd = sc.newAPIHadoopRDD(

      job.getConfiguration(),

      classOf[ColumnFamilyInputFormat],

      classOf[ByteBuffer],

      classOf[SortedMap[ByteBuffer, IColumn]])


    // Let us first get all the paragraphs from the retrieved rows

    val paraRdd = casRdd.map {

      case (key, value) => {

        ByteBufferUtil.string(value.get(ByteBufferUtil.bytes("para")).value())

      }

    }


    // Lets get the word count in paras

    val counts = paraRdd.flatMap(p => p.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)


    counts.collect().foreach {

      case (word, count) => println(word + ":" + count)

    }


    counts.map {

      case (word, count) => {

        val colWord = new org.apache.cassandra.thrift.Column()

        colWord.setName(ByteBufferUtil.bytes("word"))

        colWord.setValue(ByteBufferUtil.bytes(word))

        colWord.setTimestamp(System.currentTimeMillis)


        val colCount = new org.apache.cassandra.thrift.Column()

        colCount.setName(ByteBufferUtil.bytes("wcount"))

        colCount.setValue(ByteBufferUtil.bytes(count.toLong))

        colCount.setTimestamp(System.currentTimeMillis)


        val outputkey = ByteBufferUtil.bytes(word + "-COUNT-" + System.currentTimeMillis)


        val mutations: java.util.List[Mutation] = new Mutation() :: new Mutation() :: Nil

        mutations.get(0).setColumn_or_supercolumn(new ColumnOrSuperColumn())

        mutations.get(0).column_or_supercolumn.setColumn(colWord)

        mutations.get(1).setColumn_or_supercolumn(new ColumnOrSuperColumn())

        mutations.get(1).column_or_supercolumn.setColumn(colCount)

        (outputkey, mutations)

      }

    }.saveAsNewAPIHadoopFile("casDemo", classOf[ByteBuffer], classOf[List[Mutation]],

      classOf[ColumnFamilyOutputFormat], job.getConfiguration)


    sc.stop()

  }

}


/*

create keyspace casDemo;

use casDemo;


create column family WordCount with comparator = UTF8Type;

update column family WordCount with column_metadata =

  [{column_name: word, validation_class: UTF8Type},

    {column_name: wcount, validation_class: LongType}];


create column family Words with comparator = UTF8Type;

update column family Words with column_metadata =

  [{column_name: book, validation_class: UTF8Type},

    {column_name: para, validation_class: UTF8Type}];


assume Words keys as utf8;


set Words['3musk001']['book'] = 'The Three Musketeers';

set Words['3musk001']['para'] = 'On the first Monday of the month of April, 1625, the market

  town of Meung, in which the author of ROMANCE OF THE ROSE was born, appeared to

 be in as perfect a state of revolution as if the Huguenots had just made

 a second La Rochelle of it. Many citizens, seeing the women flying

 toward the High Street, leaving their children crying at the open doors,

 hastened to don the cuirass, and supporting their somewhat uncertain

 courage with a musket or a partisan, directed their steps toward the

 hostelry of the Jolly Miller, before which was gathered, increasing

 every minute, a compact group, vociferous and full of curiosity.';


set Words['3musk002']['book'] = 'The Three Musketeers';

set Words['3musk002']['para'] = 'In those times panics were common, and few days passed without

  some city or other registering in its archives an event of this kind. There were

  nobles, who made war against each other; there was the king, who made

  war against the cardinal; there was Spain, which made war against the

  king. Then, in addition to these concealed or public, secret or open

  wars, there were robbers, mendicants, Huguenots, wolves, and scoundrels,

  who made war upon everybody. The citizens always took up arms readily

  against thieves, wolves or scoundrels, often against nobles or

  Huguenots, sometimes against the king, but never against cardinal or

  Spain. It resulted, then, from this habit that on the said first Monday

  of April, 1625, the citizens, on hearing the clamor, and seeing neither

  the red-and-yellow standard nor the livery of the Duc de Richelieu,

  rushed toward the hostel of the Jolly Miller. When arrived there, the

  cause of the hubbub was apparent to all';


set Words['3musk003']['book'] = 'The Three Musketeers';

set Words['3musk003']['para'] = 'You ought, I say, then, to husband the means you have, however

  large the sum may be; but you ought also to endeavor to perfect yourself in

  the exercises becoming a gentleman. I will write a letter today to the

  Director of the Royal Academy, and tomorrow he will admit you without

  any expense to yourself. Do not refuse this little service. Our

  best-born and richest gentlemen sometimes solicit it without being able

  to obtain it. You will learn horsemanship, swordsmanship in all its

  branches, and dancing. You will make some desirable acquaintances; and

  from time to time you can call upon me, just to tell me how you are

  getting on, and to say whether I can be of further service to you.';



set Words['thelostworld001']['book'] = 'The Lost World';

set Words['thelostworld001']['para'] = 'She sat with that proud, delicate profile of hers outlined

  against the red curtain.  How beautiful she was!  And yet how aloof!  We had been

  friends, quite good friends; but never could I get beyond the same

  comradeship which I might have established with one of my

  fellow-reporters upon the Gazette,--perfectly frank, perfectly kindly,

  and perfectly unsexual.  My instincts are all against a woman being too

  frank and at her ease with me.  It is no compliment to a man.  Where

  the real sex feeling begins, timidity and distrust are its companions,

  heritage from old wicked days when love and violence went often hand in

  hand.  The bent head, the averted eye, the faltering voice, the wincing

  figure--these, and not the unshrinking gaze and frank reply, are the

  true signals of passion.  Even in my short life I had learned as much

  as that--or had inherited it in that race memory which we call instinct.';


set Words['thelostworld002']['book'] = 'The Lost World';

set Words['thelostworld002']['para'] = 'I always liked McArdle, the crabbed, old, round-backed,

  red-headed news editor, and I rather hoped that he liked me.  Of course, Beaumont was

  the real boss; but he lived in the rarefied atmosphere of some Olympian

  height from which he could distinguish nothing smaller than an

  international crisis or a split in the Cabinet.  Sometimes we saw him

  passing in lonely majesty to his inner sanctum, with his eyes staring

  vaguely and his mind hovering over the Balkans or the Persian Gulf.  He

  was above and beyond us.  But McArdle was his first lieutenant, and it

  was he that we knew.  The old man nodded as I entered the room, and he

  pushed his spectacles far up on his bald forehead.';


*/
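
// Example run against a local Cassandra node (hypothetical jar path; add --master as needed):
//   ./bin/spark-submit --class org.apache.spark.examples.CassandraTest \
//     spark-examples.jar localhost 9160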



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import scala.collection.JavaConversions._


/** Prints out environmental information, sleeps, and then exits. Made to

  * test driver submission in the standalone scheduler. */

object DriverSubmissionTest {

  def main(args: Array[String]) {

    if (args.size < 1) {

      println("Usage: DriverSubmissionTest <seconds-to-sleep>")

      System.exit(0)

    }

    val numSecondsToSleep = args(0).toInt


    val env = System.getenv()

    val properties = System.getProperties()


    println("Environment variables containing SPARK_TEST:")

    env.filter{case (k, v) => k.contains("SPARK_TEST")}.foreach(println)


    println("System properties containing spark.test:")

    properties.filter{case (k, v) => k.toString.contains("spark.test")}.foreach(println)


    for (i <- 1 until numSecondsToSleep) {

      println(s"Alive for $i out of $numSecondsToSleep seconds")

      Thread.sleep(1000)

    }

  }

}
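
// Example submission in standalone cluster mode (hypothetical master URL and jar path), the
// scenario this test is meant to exercise:
//   ./bin/spark-submit --deploy-mode cluster --master spark://host:7077 \
//     --class org.apache.spark.examples.DriverSubmissionTest spark-examples.jar 30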



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import org.apache.spark.{SparkConf, SparkContext}


object ExceptionHandlingTest {

  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("ExceptionHandlingTest")

    val sc = new SparkContext(sparkConf)

    sc.parallelize(0 until sc.defaultParallelism).foreach { i =>

      if (math.random > 0.75) {

        throw new Exception("Testing exception handling")

      }

    }


    sc.stop()

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.util.Random


import org.apache.spark.{SparkConf, SparkContext}

import org.apache.spark.SparkContext._


/**

  * Usage: GroupByTest [numMappers] [numKVPairs] [valSize] [numReducers]

  */

object GroupByTest {

  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("GroupBy Test")

    var numMappers = if (args.length > 0) args(0).toInt else 2

    var numKVPairs = if (args.length > 1) args(1).toInt else 1000

    var valSize = if (args.length > 2) args(2).toInt else 1000

    var numReducers = if (args.length > 3) args(3).toInt else numMappers


    val sc = new SparkContext(sparkConf)


    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>

      val ranGen = new Random

      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)

      for (i <- 0 until numKVPairs) {

        val byteArr = new Array[Byte](valSize)

        ranGen.nextBytes(byteArr)

        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)

      }

      arr1

    }.cache()

    // Ensure that everything has been calculated and is in cache

    pairs1.count()


    println(pairs1.groupByKey(numReducers).count())


    sc.stop()

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import org.apache.hadoop.hbase.client.HBaseAdmin

import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}

import org.apache.hadoop.hbase.mapreduce.TableInputFormat


import org.apache.spark._



object HBaseTest {

  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("HBaseTest")

    val sc = new SparkContext(sparkConf)

    val conf = HBaseConfiguration.create()

    // Other options for configuring scan behavior are available. More information available at

    // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html

    conf.set(TableInputFormat.INPUT_TABLE, args(0))


    // Initialize hBase table if necessary

    val admin = new HBaseAdmin(conf)

    if (!admin.isTableAvailable(args(0))) {

      val tableDesc = new HTableDescriptor(args(0))

      admin.createTable(tableDesc)

    }


    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],

      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],

      classOf[org.apache.hadoop.hbase.client.Result])


    hBaseRDD.count()


    sc.stop()

  }

}
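
// Example (hypothetical table name and jar path); the HBase table to scan is the only argument:
//   ./bin/spark-submit --class org.apache.spark.examples.HBaseTest spark-examples.jar my_table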



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import org.apache.spark._



object HdfsTest {


  /** Usage: HdfsTest [file] */

  def main(args: Array[String]) {

    if (args.length < 1) {

      System.err.println("Usage: HdfsTest <file>")

      System.exit(1)

    }

    val sparkConf = new SparkConf().setAppName("HdfsTest")

    val sc = new SparkContext(sparkConf)

    val file = sc.textFile(args(0))

    val mapped = file.map(s => s.length).cache()

    for (iter <- 1 to 10) {

      val start = System.currentTimeMillis()

      for (x <- mapped) { x + 2 }

      val end = System.currentTimeMillis()

      println("Iteration " + iter + " took " + (end-start) + " ms")

    }

    sc.stop()

  }

}
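
// Example (hypothetical HDFS path and jar path):
//   ./bin/spark-submit --class org.apache.spark.examples.HdfsTest \
//     spark-examples.jar hdfs://namenode:8020/path/to/file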



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import org.apache.commons.math3.linear._


/**

 * Alternating least squares matrix factorization.

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to org.apache.spark.mllib.recommendation.ALS

 */

object LocalALS {


  // Parameters set through command line arguments

  var M = 0 // Number of movies

  var U = 0 // Number of users

  var F = 0 // Number of features

  var ITERATIONS = 0

  val LAMBDA = 0.01 // Regularization coefficient


  def generateR(): RealMatrix = {

    val mh = randomMatrix(M, F)

    val uh = randomMatrix(U, F)

    mh.multiply(uh.transpose())

  }


  def rmse(targetR: RealMatrix, ms: Array[RealVector], us: Array[RealVector]): Double = {

    val r = new Array2DRowRealMatrix(M, U)

    for (i <- 0 until M; j <- 0 until U) {

      r.setEntry(i, j, ms(i).dotProduct(us(j)))

    }

    val diffs = r.subtract(targetR)

    var sumSqs = 0.0

    for (i <- 0 until M; j <- 0 until U) {

      val diff = diffs.getEntry(i, j)

      sumSqs += diff * diff

    }

    math.sqrt(sumSqs / (M.toDouble * U.toDouble))

  }
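
  // updateMovie and updateUser below both solve the regularized least-squares normal equations
  //   (X^T X + LAMBDA * n * I) v = X^T y
  // where X stacks the fixed factors (users or movies), y holds the observed ratings for the row
  // or column being updated, and n is the number of fixed factors.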


  def updateMovie(i: Int, m: RealVector, us: Array[RealVector], R: RealMatrix) : RealVector = {

    var XtX: RealMatrix = new Array2DRowRealMatrix(F, F)

    var Xty: RealVector = new ArrayRealVector(F)

    // For each user that rated the movie

    for (j <- 0 until U) {

      val u = us(j)

      // Add u * u^t to XtX

      XtX = XtX.add(u.outerProduct(u))

      // Add u * rating to Xty

      Xty = Xty.add(u.mapMultiply(R.getEntry(i, j)))

    }

    // Add regularization coefficients to diagonal terms

    for (d <- 0 until F) {

      XtX.addToEntry(d, d, LAMBDA * U)

    }

    // Solve it with Cholesky

    new CholeskyDecomposition(XtX).getSolver.solve(Xty)

  }


  def updateUser(j: Int, u: RealVector, ms: Array[RealVector], R: RealMatrix) : RealVector = {

    var XtX: RealMatrix = new Array2DRowRealMatrix(F, F)

    var Xty: RealVector = new ArrayRealVector(F)

    // For each movie that the user rated

    for (i <- 0 until M) {

      val m = ms(i)

      // Add m * m^t to XtX

      XtX = XtX.add(m.outerProduct(m))

      // Add m * rating to Xty

      Xty = Xty.add(m.mapMultiply(R.getEntry(i, j)))

    }

    // Add regularization coefficients to diagonal terms

    for (d <- 0 until F) {

      XtX.addToEntry(d, d, LAMBDA * M)

    }

    // Solve it with Cholesky

    new CholeskyDecomposition(XtX).getSolver.solve(Xty)

  }


  def showWarning() {

    System.err.println(

      """WARN: This is a naive implementation of ALS and is given as an example!

        |Please use the ALS method found in org.apache.spark.mllib.recommendation

        |for more conventional use.

      """.stripMargin)

  }


  def main(args: Array[String]) {


    args match {

      case Array(m, u, f, iters) => {

        M = m.toInt

        U = u.toInt

        F = f.toInt

        ITERATIONS = iters.toInt

      }

      case _ => {

        System.err.println("Usage: LocalALS <M> <U> <F> <iters>")

        System.exit(1)

      }

    }


    showWarning()


    println(s"Running with M=$M, U=$U, F=$F, iters=$ITERATIONS")


    val R = generateR()


    // Initialize m and u randomly

    var ms = Array.fill(M)(randomVector(F))

    var us = Array.fill(U)(randomVector(F))


    // Iteratively update movies then users

    for (iter <- 1 to ITERATIONS) {

      println(s"Iteration $iter:")

      ms = (0 until M).map(i => updateMovie(i, ms(i), us, R)).toArray

      us = (0 until U).map(j => updateUser(j, us(j), ms, R)).toArray

      println("RMSE = " + rmse(R, ms, us))

      println()

    }

  }


  private def randomVector(n: Int): RealVector =

    new ArrayRealVector(Array.fill(n)(math.random))


  private def randomMatrix(rows: Int, cols: Int): RealMatrix =

    new Array2DRowRealMatrix(Array.fill(rows, cols)(math.random))


}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.util.Random


import breeze.linalg.{Vector, DenseVector}


/**

 * Logistic regression based classification.

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or

 * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.

 */

object LocalFileLR {

  val D = 10   // Number of dimensions

  val rand = new Random(42)


  case class DataPoint(x: Vector[Double], y: Double)


  def parsePoint(line: String): DataPoint = {

    val nums = line.split(' ').map(_.toDouble)

    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))

  }
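
  // Each input line is expected to hold D + 1 space-separated numbers: the label y first,
  // followed by the D feature values.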


  def showWarning() {

    System.err.println(

      """WARN: This is a naive implementation of Logistic Regression and is given as an example!

        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or

        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

        |for more conventional use.

      """.stripMargin)

  }


  def main(args: Array[String]) {


    showWarning()


    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray

    val points = lines.map(parsePoint _)

    val ITERATIONS = args(1).toInt


    // Initialize w to a random value

    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}

    println("Initial w: " + w)


    for (i <- 1 to ITERATIONS) {

      println("On iteration " + i)

      var gradient = DenseVector.zeros[Double](D)
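      // The per-point term below is the gradient of the logistic loss log(1 + exp(-y * w.x)) w.r.t. w:
      //   (1 / (1 + exp(-y * w.x)) - 1) * y * x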

      for (p <- points) {

        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y

        gradient += p.x * scale

      }

      w -= gradient

    }


    println("Final w: " + w)

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.util.Random


import scala.collection.mutable.HashMap

import scala.collection.mutable.HashSet


import breeze.linalg.{Vector, DenseVector, squaredDistance}


import org.apache.spark.SparkContext._


/**

 * K-means clustering.

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to org.apache.spark.mllib.clustering.KMeans

 */

object LocalKMeans {

  val N = 1000

  val R = 1000    // Scaling factor

  val D = 10

  val K = 10

  val convergeDist = 0.001

  val rand = new Random(42)


  def generateData = {

    def generatePoint(i: Int) = {

      DenseVector.fill(D){rand.nextDouble * R}

    }

    Array.tabulate(N)(generatePoint)

  }


  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {

    var index = 0

    var bestIndex = 0

    var closest = Double.PositiveInfinity


    for (i <- 1 to centers.size) {

      val vCurr = centers.get(i).get

      val tempDist = squaredDistance(p, vCurr)

      if (tempDist < closest) {

        closest = tempDist

        bestIndex = i

      }

    }


    bestIndex

  }


  def showWarning() {

    System.err.println(

      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!

        |Please use the KMeans method found in org.apache.spark.mllib.clustering

        |for more conventional use.

      """.stripMargin)

  }


  def main(args: Array[String]) {


    showWarning()


    val data = generateData

    var points = new HashSet[Vector[Double]]

    var kPoints = new HashMap[Int, Vector[Double]]

    var tempDist = 1.0


    while (points.size < K) {

      points.add(data(rand.nextInt(N)))

    }


    val iter = points.iterator

    for (i <- 1 to points.size) {

      kPoints.put(i, iter.next())

    }


    println("Initial centers: " + kPoints)


    while(tempDist > convergeDist) {

      var closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))


      var mappings = closest.groupBy[Int] (x => x._1)


      var pointStats = mappings.map { pair =>

        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {

          case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2))

        }

      }


      var newPoints = pointStats.map {mapping =>

        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}
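
      // tempDist accumulates the total squared movement of the centers; the outer loop stops once
      // the centers move less than convergeDist in aggregate.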


      tempDist = 0.0

      for (mapping <- newPoints) {

        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)

      }


      for (newP <- newPoints) {

        kPoints.put(newP._1, newP._2)

      }

    }


    println("Final centers: " + kPoints)

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.util.Random


import breeze.linalg.{Vector, DenseVector}


/**

 * Logistic regression based classification.

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or

 * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.

 */

object LocalLR {

  val N = 10000  // Number of data points

  val D = 10   // Number of dimensions

  val R = 0.7  // Scaling factor

  val ITERATIONS = 5

  val rand = new Random(42)


  case class DataPoint(x: Vector[Double], y: Double)


  def generateData = {

    def generatePoint(i: Int) = {

      val y = if (i % 2 == 0) -1 else 1

      val x = DenseVector.fill(D){rand.nextGaussian + y * R}

      DataPoint(x, y)

    }

    Array.tabulate(N)(generatePoint)

  }


  def showWarning() {

    System.err.println(

      """WARN: This is a naive implementation of Logistic Regression and is given as an example!

        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or

        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

        |for more conventional use.

      """.stripMargin)

  }


  def main(args: Array[String]) {


    showWarning()


    val data = generateData

    // Initialize w to a random value

    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}

    println("Initial w: " + w)


    for (i <- 1 to ITERATIONS) {

      println("On iteration " + i)

      var gradient = DenseVector.zeros[Double](D)

      for (p <- data) {

        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y

        gradient +=  p.x * scale

      }

      w -= gradient

    }


    println("Final w: " + w)

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import scala.math.random


import org.apache.spark._

import org.apache.spark.SparkContext._


object LocalPi {

  def main(args: Array[String]) {

    var count = 0

    for (i <- 1 to 100000) {

      val x = random * 2 - 1

      val y = random * 2 - 1

      if (x*x + y*y < 1) count += 1

    }

    println("Pi is roughly " + 4 * count / 100000.0)

  }

}
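
// A minimal sketch (not part of the original example; the object name is hypothetical) of the same
// Monte Carlo estimator distributed with Spark: the fraction of random points in the unit square
// that fall inside the unit circle approximates Pi / 4.
object LocalPiOnSpark {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("LocalPiOnSpark")
    val sc = new SparkContext(sparkConf)
    val n = 100000
    // Sample points on the executors and sum the hits with a reduce
    val count = sc.parallelize(1 to n).map { _ =>
      val x = random * 2 - 1
      val y = random * 2 - 1
      if (x * x + y * y < 1) 1 else 0
    }.reduce(_ + _)
    println("Pi is roughly " + 4.0 * count / n)
    sc.stop()
  }
}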



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import org.apache.spark.{SparkConf, SparkContext}

import org.apache.spark.SparkContext._


/**

 * Executes a roll-up-style query against Apache logs.

 *  

 * Usage: LogQuery [logFile]

 */

object LogQuery {

  val exampleApacheLogs = List(

    """10.10.10.10 - "FRED" [18/Jan/2013:17:56:07 +1100] "GET http://p_w_picpaths.com/2013/Generic.jpg

      | HTTP/1.1" 304 315 "http://referall.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;

      | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR

      | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR

      | 3.5.30729; Release=ARP)" "UD-1" - "p_w_picpath/jpeg" "whatever" 0.350 "-" - "" 265 923 934 ""

      | 62.24.11.25 p_w_picpaths.com 1358492167 - Whatup""".stripMargin.lines.mkString,

    """10.10.10.10 - "FRED" [18/Jan/2013:18:02:37 +1100] "GET http://p_w_picpaths.com/2013/Generic.jpg

      | HTTP/1.1" 304 306 "http:/referall.com" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;

      | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR

      | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR

      | 3.5.30729; Release=ARP)" "UD-1" - "p_w_picpath/jpeg" "whatever" 0.352 "-" - "" 256 977 988 ""

      | 0 73.23.2.15 p_w_picpaths.com 1358492557 - Whatup""".stripMargin.lines.mkString

  )


  def main(args: Array[String]) {


    val sparkConf = new SparkConf().setAppName("Log Query")

    val sc = new SparkContext(sparkConf)


    val dataSet =

      if (args.length == 1) sc.textFile(args(0)) else sc.parallelize(exampleApacheLogs)

    // scalastyle:off

    val apacheLogRegex =

      """^([\d.]+) (\S+) (\S+) \[([\w\d:/]+\s[+\-]\d{4})\] "(.+?)" (\d{3}) ([\d\-]+) "([^"]+)" "([^"]+)".*""".r

    // scalastyle:on

    /** Tracks the total query count and number of aggregate bytes for a particular group. */

    class Stats(val count: Int, val numBytes: Int) extends Serializable {

      def merge(other: Stats) = new Stats(count + other.count, numBytes + other.numBytes)

      override def toString = "bytes=%s\tn=%s".format(numBytes, count)

    }


    def extractKey(line: String): (String, String, String) = {

      apacheLogRegex.findFirstIn(line) match {

        case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) =>

          if (user != "\"-\"") (ip, user, query)

          else (null, null, null)

        case _ => (null, null, null)

      }

    }


    def extractStats(line: String): Stats = {

      apacheLogRegex.findFirstIn(line) match {

        case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) =>

          new Stats(1, bytes.toInt)

        case _ => new Stats(1, 0)

      }

    }


    dataSet.map(line => (extractKey(line), extractStats(line)))

      .reduceByKey((a, b) => a.merge(b))

      .collect().foreach{

        case (key, stats) => println("%s\t%s".format(key, stats))}


    sc.stop()

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import org.apache.spark.rdd.RDD

import org.apache.spark.{SparkConf, SparkContext}


/**

  * Usage: MultiBroadcastTest [slices] [numElem]

  */

object MultiBroadcastTest {

  def main(args: Array[String]) {


    val sparkConf = new SparkConf().setAppName("Multi-Broadcast Test")

    val sc = new SparkContext(sparkConf)


    val slices = if (args.length > 0) args(0).toInt else 2

    val num = if (args.length > 1) args(1).toInt else 1000000


    val arr1 = new Array[Int](num)

    for (i <- 0 until arr1.length) {

      arr1(i) = i

    }


    val arr2 = new Array[Int](num)

    for (i <- 0 until arr2.length) {

      arr2(i) = i

    }


    val barr1 = sc.broadcast(arr1)

    val barr2 = sc.broadcast(arr2)

    val observedSizes: RDD[(Int, Int)] = sc.parallelize(1 to 10, slices).map { _ =>

      (barr1.value.size, barr2.value.size)

    }

    // Collect the small RDD so we can print the observed sizes locally.

    observedSizes.collect().foreach(i => println(i))


    sc.stop()

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.util.Random


import org.apache.spark.{SparkConf, SparkContext}

import org.apache.spark.SparkContext._


/**

  * Usage: SimpleSkewedGroupByTest [numMappers] [numKVPairs] [valSize] [numReducers] [ratio]

  */

object SimpleSkewedGroupByTest {

  def main(args: Array[String]) {


    val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest")

    var numMappers = if (args.length > 0) args(0).toInt else 2

    var numKVPairs = if (args.length > 1) args(1).toInt else 1000

    var valSize = if (args.length > 2) args(2).toInt else 1000

    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    var ratio = if (args.length > 4) args(4).toInt else 5.0


    val sc = new SparkContext(sparkConf)


    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>

      val ranGen = new Random

      var result = new Array[(Int, Array[Byte])](numKVPairs)

      for (i <- 0 until numKVPairs) {

        val byteArr = new Array[Byte](valSize)

        ranGen.nextBytes(byteArr)

        val offset = ranGen.nextInt(1000) * numReducers
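        // offset is a multiple of numReducers, so a key equal to offset hashes to reducer 0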

        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {

          // give ratio times higher chance of generating key 0 (for reducer 0)

          result(i) = (offset, byteArr)

        } else {

          // generate a key for one of the other reducers

          val key = 1 + ranGen.nextInt(numReducers-1) + offset

          result(i) = (key, byteArr)

        }

      }

      result

    }.cache

    // Ensure that everything has been calculated and is in cache

    pairs1.count


    println("RESULT: " + pairs1.groupByKey(numReducers).count)

    // Print how many keys each reducer got (for debugging)

    // println("RESULT: " + pairs1.groupByKey(numReducers)

    //                           .map{case (k,v) => (k, v.size)}

    //                           .collectAsMap)


    sc.stop()

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.util.Random


import org.apache.spark.{SparkConf, SparkContext}

import org.apache.spark.SparkContext._


/**

  * Usage: SkewedGroupByTest [numMappers] [numKVPairs] [valSize] [numReducers]

  */

object SkewedGroupByTest {

  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("GroupBy Test")

    var numMappers = if (args.length > 0) args(0).toInt else 2

    var numKVPairs = if (args.length > 1) args(1).toInt else 1000

    var valSize = if (args.length > 2) args(2).toInt else 1000

    var numReducers = if (args.length > 3) args(3).toInt else numMappers


    val sc = new SparkContext(sparkConf)


    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>

      val ranGen = new Random


      // map output sizes linearly increase from the 1st to the last

      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt


      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)

      for (i <- 0 until numKVPairs) {

        val byteArr = new Array[Byte](valSize)

        ranGen.nextBytes(byteArr)

        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)

      }

      arr1

    }.cache()

    // Ensure that everything has been calculated and is in cache

    pairs1.count()


    println(pairs1.groupByKey(numReducers).count())


    sc.stop()

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import org.apache.commons.math3.linear._


import org.apache.spark._


/**

 * Alternating least squares matrix factorization.

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to org.apache.spark.mllib.recommendation.ALS

 */

object SparkALS {


  // Parameters set through command line arguments

  var M = 0 // Number of movies

  var U = 0 // Number of users

  var F = 0 // Number of features

  var ITERATIONS = 0

  val LAMBDA = 0.01 // Regularization coefficient


  def generateR(): RealMatrix = {

    val mh = randomMatrix(M, F)

    val uh = randomMatrix(U, F)

    mh.multiply(uh.transpose())

  }


  def rmse(targetR: RealMatrix, ms: Array[RealVector], us: Array[RealVector]): Double = {

    val r = new Array2DRowRealMatrix(M, U)

    for (i <- 0 until M; j <- 0 until U) {

      r.setEntry(i, j, ms(i).dotProduct(us(j)))

    }

    val diffs = r.subtract(targetR)

    var sumSqs = 0.0

    for (i <- 0 until M; j <- 0 until U) {

      val diff = diffs.getEntry(i, j)

      sumSqs += diff * diff

    }

    math.sqrt(sumSqs / (M.toDouble * U.toDouble))

  }


  def update(i: Int, m: RealVector, us: Array[RealVector], R: RealMatrix) : RealVector = {

    val U = us.size

    val F = us(0).getDimension

    var XtX: RealMatrix = new Array2DRowRealMatrix(F, F)

    var Xty: RealVector = new ArrayRealVector(F)

    // For each user that rated the movie

    for (j <- 0 until U) {

      val u = us(j)

      // Add u * u^t to XtX

      XtX = XtX.add(u.outerProduct(u))

      // Add u * rating to Xty

      Xty = Xty.add(u.mapMultiply(R.getEntry(i, j)))

    }

    // Add regularization coefs to diagonal terms

    for (d <- 0 until F) {

      XtX.addToEntry(d, d, LAMBDA * U)

    }

    // Solve it with Cholesky

    new CholeskyDecomposition(XtX).getSolver.solve(Xty)

  }


  def showWarning() {

    System.err.println(

      """WARN: This is a naive implementation of ALS and is given as an example!

        |Please use the ALS method found in org.apache.spark.mllib.recommendation

        |for more conventional use.

      """.stripMargin)

  }


  def main(args: Array[String]) {


    var slices = 0


    val options = (0 to 4).map(i => if (i < args.length) Some(args(i)) else None)


    options.toArray match {

      case Array(m, u, f, iters, slices_) =>

        M = m.getOrElse("100").toInt

        U = u.getOrElse("500").toInt

        F = f.getOrElse("10").toInt

        ITERATIONS = iters.getOrElse("5").toInt

        slices = slices_.getOrElse("2").toInt

      case _ =>

        System.err.println("Usage: SparkALS [M] [U] [F] [iters] [slices]")

        System.exit(1)

    }


    showWarning()


    println(s"Running with M=$M, U=$U, F=$F, iters=$ITERATIONS")


    val sparkConf = new SparkConf().setAppName("SparkALS")

    val sc = new SparkContext(sparkConf)


    val R = generateR()


    // Initialize m and u randomly

    var ms = Array.fill(M)(randomVector(F))

    var us = Array.fill(U)(randomVector(F))


    // Iteratively update movies then users

    val Rc  = sc.broadcast(R)

    var msb = sc.broadcast(ms)

    var usb = sc.broadcast(us)

    for (iter <- 1 to ITERATIONS) {

      println(s"Iteration $iter:")

      ms = sc.parallelize(0 until M, slices)

                .map(i => update(i, msb.value(i), usb.value, Rc.value))

                .collect()

      msb = sc.broadcast(ms) // Re-broadcast ms because it was updated

      us = sc.parallelize(0 until U, slices)

                .map(i => update(i, usb.value(i), msb.value, Rc.value.transpose()))

                .collect()

      usb = sc.broadcast(us) // Re-broadcast us because it was updated

      println("RMSE = " + rmse(R, ms, us))

      println()

    }


    sc.stop()

  }


  private def randomVector(n: Int): RealVector =

    new ArrayRealVector(Array.fill(n)(math.random))


  private def randomMatrix(rows: Int, cols: Int): RealMatrix =

    new Array2DRowRealMatrix(Array.fill(rows, cols)(math.random))


}
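

// A minimal sketch (not part of the original examples) of the org.apache.spark.mllib.recommendation.ALS
// API that the warning above points to. The rating data, rank, iteration count and lambda here are
// illustrative assumptions, not tuned values.
object MLlibALSSketch {
  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.mllib.recommendation.{ALS, Rating}

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("MLlibALSSketch"))

    // Synthetic (user, product, rating) triples; real input would come from a file.
    val ratings = sc.parallelize(Seq(
      Rating(0, 0, 4.0), Rating(0, 1, 1.0),
      Rating(1, 0, 5.0), Rating(1, 2, 2.0),
      Rating(2, 1, 3.0), Rating(2, 2, 4.0)))

    val rank = 10       // number of latent features (F above)
    val iterations = 5  // ALS iterations
    val lambda = 0.01   // regularization, as LAMBDA above

    val model = ALS.train(ratings, rank, iterations, lambda)

    // Predict a single (user, product) pair.
    println("Predicted rating for user 0, product 2: " + model.predict(0, 2))

    sc.stop()
  }
}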



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.util.Random


import scala.math.exp


import breeze.linalg.{Vector, DenseVector}

import org.apache.hadoop.conf.Configuration


import org.apache.spark._

import org.apache.spark.scheduler.InputFormatInfo



/**

 * Logistic regression based classification.

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or

 * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.

 */

object SparkHdfsLR {

  val D = 10   // Number of dimensions

  val rand = new Random(42)


  case class DataPoint(x: Vector[Double], y: Double)


  def parsePoint(line: String): DataPoint = {

    val tok = new java.util.StringTokenizer(line, " ")

    val y = tok.nextToken.toDouble

    val x = new Array[Double](D)

    var i = 0

    while (i < D) {

      x(i) = tok.nextToken.toDouble; i += 1

    }

    DataPoint(new DenseVector(x), y)

  }


  def showWarning() {

    System.err.println(

      """WARN: This is a naive implementation of Logistic Regression and is given as an example!

        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or

        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

        |for more conventional use.

      """.stripMargin)

  }


  def main(args: Array[String]) {


    if (args.length < 2) {

      System.err.println("Usage: SparkHdfsLR <file> <iters>")

      System.exit(1)

    }


    showWarning()


    val sparkConf = new SparkConf().setAppName("SparkHdfsLR")

    val inputPath = args(0)

    val conf = new Configuration()

    val sc = new SparkContext(sparkConf,

      InputFormatInfo.computePreferredLocations(

        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))

      ))

    val lines = sc.textFile(inputPath)

    val points = lines.map(parsePoint _).cache()

    val ITERATIONS = args(1).toInt


    // Initialize w to a random value

    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}

    println("Initial w: " + w)


    for (i <- 1 to ITERATIONS) {

      println("On iteration " + i)

      val gradient = points.map { p =>

        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y

      }.reduce(_ + _)

      w -= gradient

    }


    println("Final w: " + w)

    sc.stop()

  }

}
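

// A minimal sketch (not part of the original examples) of the MLlib alternative named in the warning
// above. It assumes the same input layout as parsePoint (a label followed by D space-separated
// features, with the input path given as the first argument) and maps the -1/+1 labels to the
// 0/1 labels MLlib expects.
object MLlibLogisticRegressionSketch {
  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
  import org.apache.spark.mllib.linalg.Vectors
  import org.apache.spark.mllib.regression.LabeledPoint

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("MLlibLogisticRegressionSketch"))

    val points = sc.textFile(args(0)).map { line =>
      val tokens = line.split(" ")
      val label = if (tokens(0).toDouble > 0) 1.0 else 0.0  // MLlib uses 0/1 labels
      LabeledPoint(label, Vectors.dense(tokens.tail.map(_.toDouble)))
    }.cache()

    val model = new LogisticRegressionWithLBFGS().setNumClasses(2).run(points)
    println("Weights: " + model.weights)

    sc.stop()
  }
}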



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import breeze.linalg.{Vector, DenseVector, squaredDistance}


import org.apache.spark.{SparkConf, SparkContext}

import org.apache.spark.SparkContext._


/**

 * K-means clustering.

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to org.apache.spark.mllib.clustering.KMeans

 */

object SparkKMeans {


  def parseVector(line: String): Vector[Double] = {

    DenseVector(line.split(' ').map(_.toDouble))

  }


  def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = {

    var bestIndex = 0

    var closest = Double.PositiveInfinity


    for (i <- 0 until centers.length) {

      val tempDist = squaredDistance(p, centers(i))

      if (tempDist < closest) {

        closest = tempDist

        bestIndex = i

      }

    }


    bestIndex

  }


  def showWarning() {

    System.err.println(

      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!

        |Please use the KMeans method found in org.apache.spark.mllib.clustering

        |for more conventional use.

      """.stripMargin)

  }


  def main(args: Array[String]) {


    if (args.length < 3) {

      System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>")

      System.exit(1)

    }


    showWarning()


    val sparkConf = new SparkConf().setAppName("SparkKMeans")

    val sc = new SparkContext(sparkConf)

    val lines = sc.textFile(args(0))

    val data = lines.map(parseVector _).cache()

    val K = args(1).toInt

    val convergeDist = args(2).toDouble


    val kPoints = data.takeSample(withReplacement = false, K, 42).toArray

    var tempDist = 1.0


    while (tempDist > convergeDist) {

      val closest = data.map(p => (closestPoint(p, kPoints), (p, 1)))


      val pointStats = closest.reduceByKey{case ((x1, y1), (x2, y2)) => (x1 + x2, y1 + y2)}


      val newPoints = pointStats.map {pair =>

        (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap()


      tempDist = 0.0

      for (i <- 0 until K) {

        tempDist += squaredDistance(kPoints(i), newPoints(i))

      }


      for (newP <- newPoints) {

        kPoints(newP._1) = newP._2

      }

      println("Finished iteration (delta = " + tempDist + ")")

    }


    println("Final centers:")

    kPoints.foreach(println)

    sc.stop()

  }

}
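

// A minimal sketch (not part of the original examples) of org.apache.spark.mllib.clustering.KMeans,
// the implementation recommended in the warning above. It assumes the same whitespace-separated
// input format as parseVector; the path and k come from the arguments, and the iteration cap is
// an illustrative choice.
object MLlibKMeansSketch {
  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.mllib.clustering.KMeans
  import org.apache.spark.mllib.linalg.Vectors

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("MLlibKMeansSketch"))

    val data = sc.textFile(args(0))
      .map(line => Vectors.dense(line.split(' ').map(_.toDouble)))
      .cache()

    val k = args(1).toInt
    val maxIterations = 20

    val model = KMeans.train(data, k, maxIterations)

    println("Final centers:")
    model.clusterCenters.foreach(println)
    println("Within-set sum of squared errors = " + model.computeCost(data))

    sc.stop()
  }
}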



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.util.Random


import scala.math.exp


import breeze.linalg.{Vector, DenseVector}


import org.apache.spark._


/**

 * Logistic regression based classification.

 * Usage: SparkLR [slices]

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or

 * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.

 */

object SparkLR {

  val N = 10000  // Number of data points

  val D = 10   // Number of dimensions

  val R = 0.7  // Scaling factor

  val ITERATIONS = 5

  val rand = new Random(42)


  case class DataPoint(x: Vector[Double], y: Double)


  def generateData = {

    def generatePoint(i: Int) = {

      val y = if (i % 2 == 0) -1 else 1

      val x = DenseVector.fill(D){rand.nextGaussian + y * R}

      DataPoint(x, y)

    }

    Array.tabulate(N)(generatePoint)

  }


  def showWarning() {

    System.err.println(

      """WARN: This is a naive implementation of Logistic Regression and is given as an example!

        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or

        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

        |for more conventional use.

      """.stripMargin)

  }


  def main(args: Array[String]) {


    showWarning()


    val sparkConf = new SparkConf().setAppName("SparkLR")

    val sc = new SparkContext(sparkConf)

    val numSlices = if (args.length > 0) args(0).toInt else 2

    val points = sc.parallelize(generateData, numSlices).cache()


    // Initialize w to a random value

    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}

    println("Initial w: " + w)


    for (i <- 1 to ITERATIONS) {

      println("On iteration " + i)

      val gradient = points.map { p =>

        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y

      }.reduce(_ + _)

      w -= gradient

    }


    println("Final w: " + w)


    sc.stop()

  }

}
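

// A small local sketch (not part of the original examples) that checks the gradient expression used
// inside points.map { ... } above: for the logistic loss log(1 + exp(-y * w.x)), the gradient with
// respect to w is x * (1 / (1 + exp(-y * w.x)) - 1) * y. The test vectors below are arbitrary.
object LogisticGradientCheck {
  import breeze.linalg.DenseVector
  import scala.math.{exp, log}

  def loss(w: DenseVector[Double], x: DenseVector[Double], y: Double): Double =
    log(1 + exp(-y * (w dot x)))

  // Same expression as in SparkLR's map step.
  def gradient(w: DenseVector[Double], x: DenseVector[Double], y: Double): DenseVector[Double] =
    x * ((1.0 / (1.0 + exp(-y * (w dot x))) - 1.0) * y)

  def main(args: Array[String]) {
    val w = DenseVector(0.5, -0.25, 1.0)
    val x = DenseVector(1.0, 2.0, -1.0)
    val y = 1.0
    val eps = 1e-6

    // Central finite differences, coordinate by coordinate.
    val numeric = DenseVector.tabulate(w.length) { i =>
      val wPlus = w.copy;  wPlus(i) += eps
      val wMinus = w.copy; wMinus(i) -= eps
      (loss(wPlus, x, y) - loss(wMinus, x, y)) / (2 * eps)
    }

    println("analytic gradient: " + gradient(w, x, y))
    println("numeric  gradient: " + numeric)
  }
}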



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import org.apache.spark.SparkContext._

import org.apache.spark.{SparkConf, SparkContext}


/**

 * Computes the PageRank of URLs from an input file. Input file should

 * be in format of:

 * URL         neighbor URL

 * URL         neighbor URL

 * URL         neighbor URL

 * ...

 * where URL and their neighbors are separated by space(s).

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to org.apache.spark.graphx.lib.PageRank

 */

object SparkPageRank {


  def showWarning() {

    System.err.println(

      """WARN: This is a naive implementation of PageRank and is given as an example!

        |Please use the PageRank implementation found in org.apache.spark.graphx.lib.PageRank

        |for more conventional use.

      """.stripMargin)

  }


  def main(args: Array[String]) {

    if (args.length < 1) {

      System.err.println("Usage: SparkPageRank <file> <iter>")

      System.exit(1)

    }


    showWarning()


    val sparkConf = new SparkConf().setAppName("PageRank")

    val iters = if (args.length > 1) args(1).toInt else 10

    val ctx = new SparkContext(sparkConf)

    val lines = ctx.textFile(args(0), 1)

    val links = lines.map{ s =>

      val parts = s.split("\\s+")

      (parts(0), parts(1))

    }.distinct().groupByKey().cache()

    var ranks = links.mapValues(v => 1.0)


    for (i <- 1 to iters) {

      val contribs = links.join(ranks).values.flatMap{ case (urls, rank) =>

        val size = urls.size

        urls.map(url => (url, rank / size))

      }

      ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _)

    }


    val output = ranks.collect()

    output.foreach(tup => println(tup._1 + " has rank: " + tup._2 + "."))


    ctx.stop()

  }

}
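

// A minimal sketch (not part of the original examples) of the GraphX PageRank named in the warning
// above. Unlike the string URLs handled by SparkPageRank, GraphLoader.edgeListFile expects an edge
// list of numeric vertex IDs ("srcId dstId" per line); that input format is an assumption here.
object GraphxPageRankSketch {
  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.graphx.GraphLoader

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("GraphxPageRankSketch"))

    val graph = GraphLoader.edgeListFile(sc, args(0))

    // Run PageRank until the ranks change by less than the given tolerance.
    val ranks = graph.pageRank(0.0001).vertices

    ranks.collect().foreach { case (id, rank) => println(id + " has rank: " + rank + ".") }

    sc.stop()
  }
}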



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import scala.math.random


import org.apache.spark._


/** Computes an approximation to pi */

object SparkPi {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("Spark Pi")

    val spark = new SparkContext(conf)

    val slices = if (args.length > 0) args(0).toInt else 2

    val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow

    val count = spark.parallelize(1 until n, slices).map { i =>

      val x = random * 2 - 1

      val y = random * 2 - 1

      if (x*x + y*y < 1) 1 else 0

    }.reduce(_ + _)

    println("Pi is roughly " + 4.0 * count / n)

    spark.stop()

  }

}
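


// A small local (non-Spark) sketch of the same Monte Carlo estimate, added here to make the math
// explicit: a point drawn uniformly from [-1, 1] x [-1, 1] lands inside the unit circle with
// probability pi / 4, so 4 * (hits / n) approximates pi.
object LocalPiSketch {
  import scala.math.random

  def main(args: Array[String]) {
    val n = if (args.length > 0) args(0).toInt else 1000000
    val count = (1 to n).count { _ =>
      val x = random * 2 - 1
      val y = random * 2 - 1
      x * x + y * y < 1
    }
    println("Pi is roughly " + 4.0 * count / n)
  }
}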




/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import java.util.Random


import scala.math.exp


import breeze.linalg.{Vector, DenseVector}

import org.apache.hadoop.conf.Configuration


import org.apache.spark._

import org.apache.spark.scheduler.InputFormatInfo

import org.apache.spark.storage.StorageLevel



/**

 * Logistic regression based classification.

 * This example uses Tachyon to persist RDDs during computation.

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or

 * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.

 */

object SparkTachyonHdfsLR {

  val D = 10   // Number of dimensions

  val rand = new Random(42)


  def showWarning() {

    System.err.println(

      """WARN: This is a naive implementation of Logistic Regression and is given as an example!

        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or

        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS

        |for more conventional use.

      """.stripMargin)

  }


  case class DataPoint(x: Vector[Double], y: Double)


  def parsePoint(line: String): DataPoint = {

    val tok = new java.util.StringTokenizer(line, " ")

    val y = tok.nextToken.toDouble

    val x = new Array[Double](D)

    var i = 0

    while (i < D) {

      x(i) = tok.nextToken.toDouble; i += 1

    }

    DataPoint(new DenseVector(x), y)

  }


  def main(args: Array[String]) {


    if (args.length < 2) {

      System.err.println("Usage: SparkTachyonHdfsLR <file> <iters>")

      System.exit(1)

    }


    showWarning()


    val inputPath = args(0)

    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")

    val conf = new Configuration()

    val sc = new SparkContext(sparkConf,

      InputFormatInfo.computePreferredLocations(

        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))

      ))

    val lines = sc.textFile(inputPath)

    val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)

    val ITERATIONS = args(1).toInt


    // Initialize w to a random value

    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}

    println("Initial w: " + w)


    for (i <- 1 to ITERATIONS) {

      println("On iteration " + i)

      val gradient = points.map { p =>

        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y

      }.reduce(_ + _)

      w -= gradient

    }


    println("Final w: " + w)

    sc.stop()

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import scala.math.random


import org.apache.spark._

import org.apache.spark.storage.StorageLevel


/**

 *  Computes an approximation to pi

 *  This example uses Tachyon to persist RDDs during computation.

 */

object SparkTachyonPi {

  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("SparkTachyonPi")

    val spark = new SparkContext(sparkConf)


    val slices = if (args.length > 0) args(0).toInt else 2

    val n = 100000 * slices


    val rdd = spark.parallelize(1 to n, slices)

    rdd.persist(StorageLevel.OFF_HEAP)

    val count = rdd.map { i =>

      val x = random * 2 - 1

      val y = random * 2 - 1

      if (x * x + y * y < 1) 1 else 0

    }.reduce(_ + _)

    println("Pi is roughly " + 4.0 * count / n)


    spark.stop()

  }

}



/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


package org.apache.spark.examples


import scala.util.Random

import scala.collection.mutable


import org.apache.spark.{SparkConf, SparkContext}

import org.apache.spark.SparkContext._


/**

 * Transitive closure on a graph.

 */

object SparkTC {

  val numEdges = 200

  val numVertices = 100

  val rand = new Random(42)


  def generateGraph = {

    val edges: mutable.Set[(Int, Int)] = mutable.Set.empty

    while (edges.size < numEdges) {

      val from = rand.nextInt(numVertices)

      val to = rand.nextInt(numVertices)

      if (from != to) edges += ((from, to))

    }

    edges.toSeq

  }


  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("SparkTC")

    val spark = new SparkContext(sparkConf)

    val slices = if (args.length > 0) args(0).toInt else 2

    var tc = spark.parallelize(generateGraph, slices).cache()


    // Linear transitive closure: each round grows paths by one edge,

    // by joining the graph's edges with the already-discovered paths.

    // e.g. join the path (y, z) from the TC with the edge (x, y) from

    // the graph to obtain the path (x, z).


    // Because join() joins on keys, the edges are stored in reversed order.

    val edges = tc.map(x => (x._2, x._1))


    // This join is iterated until a fixed point is reached.

    var oldCount = 0L

    var nextCount = tc.count()

    do {

      oldCount = nextCount

      // Perform the join, obtaining an RDD of (y, (z, x)) pairs,

      // then project the result to obtain the new (x, z) paths.

      tc = tc.union(tc.join(edges).map(x => (x._2._2, x._2._1))).distinct().cache()

      nextCount = tc.count()

    } while (nextCount != oldCount)


    println("TC has " + tc.count() + " edges.")

    spark.stop()

  }

}
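

// A small local (non-Spark) sketch (not part of the original examples) of the same fixed-point
// computation on plain Scala sets, to make the join step easier to follow on a tiny graph.
object LocalTCSketch {
  def main(args: Array[String]) {
    // Paths discovered so far, starting from the raw edges x -> y stored as (x, y).
    var tc = Set((1, 2), (2, 3), (3, 4))
    // The original edges keyed by destination, mirroring edges = tc.map(x => (x._2, x._1)) above.
    val edges = tc.map { case (from, to) => (to, from) }

    var oldSize = 0
    do {
      oldSize = tc.size
      // Join path (y, z) with edge (x, y) to obtain the new path (x, z).
      val grown = for ((y, z) <- tc; (dst, x) <- edges if dst == y) yield (x, z)
      tc = tc ++ grown
    } while (tc.size != oldSize)

    println("TC has " + tc.size + " edges: " + tc)
  }
}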