Setting up a script for running Spark applications

1. Create a shell script (sparkrun.sh) as follows:

#!/usr/bin/env bash
# Launches a Java/Scala main class with the Spark 0.9.0 conf directory, the
# Spark assembly jar, and every jar in the current directory on the classpath.
# All script arguments (main class first) are passed through to java.
export SPARK_MEM=3g

SPARK_DIR=/opt/spark-0.9.0-incubating-bin-hadoop2
SPARK_ASSEMBLY="${SPARK_DIR}/assembly/target/scala-2.10/spark-assembly_2.10-0.9.0-incubating-hadoop2.2.0.jar"

# The trailing ./* is left for java itself to expand, so it must stay quoted.
CP="${CLASSPATH}:${SPARK_DIR}/conf:${SPARK_ASSEMBLY}:./*"
exec java -cp "${CP}" "$@"

You can then run an application with: ./sparkrun.sh [MainClass]


2. Build a Spark demo in Java or Scala, package it into a jar file, and upload it to the directory you run sparkrun.sh from (it is picked up by the ./* classpath entry).

    The Spark application depends on the following jar files:

spark-mllib_2.10-0.9.0-incubating.jar
spark-streaming_2.10-0.9.0-incubating.jar
spark-graphx_2.10-0.9.0-incubating.jar
spark-assembly_2.10-0.9.0-incubating-hadoop2.2.0.jar

  Example 1 (Java): the K-means machine learning algorithm implemented with Spark.

 

import java.io.Serializable;
import java.util.List;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

public class MyKMeans2 {

	/**
	 * Runs k-means (k = 3, at most 20 iterations) over 3-dimensional points
	 * read from a text file, one whitespace-separated point per line, then
	 * prints every point with its cluster index followed by the final centers.
	 *
	 * @param args
	 *            unused; the master URL and the input path are hard-coded
	 */
	public static void main(String[] args) {

		int k = 3;
		int iter = 20;
		double threshold = 0.0001;

		String[] arr = new String[2];
		arr[0] = "spark://masterIP:7077";
		arr[1] = "hdfs://namenodeIP:9000/test/kmeans_data.txt";
		JavaSparkContext ctx = new JavaSparkContext(arr[0], "MyKeans",
				System.getenv("SPARK_HOME"),
				JavaSparkContext.jarOfClass(MyKMeans2.class));
		try {
			// The input is scanned once per iteration, so keep it cached.
			JavaRDD<String> data = ctx.textFile(arr[1]).cache();

			// Seed the centers with k sampled input points.
			List<String> ls = data.takeSample(false, k, 1);
			System.out.println("LS:=" + ls.size());
			if (ls.size() < k) {
				throw new IllegalStateException("need at least " + k
						+ " input points to seed the centers, got "
						+ ls.size());
			}

			Point[] curCenterPoint = new Point[k];
			for (int i = 0; i < k; i++) {
				curCenterPoint[i] = parsePoint(ls.get(i));
				curCenterPoint[i].setType(i);
			}

			for (int i = 0; i < iter; i++) {
				// Label every point with the index of its nearest center.
				JavaRDD<Point> assigned = data.map(new ComputerClass(
						curCenterPoint, k));

				// Emit (cluster, [x1, x2, x3, 1]) so that a plain element-wise
				// sum yields the coordinate totals and the member count. A
				// pairwise "(a + b) / 2" reduction is not a mean and depends
				// on the order in which Spark combines the values.
				JavaPairRDD<Integer, double[]> perPoint = assigned
						.map(new PairFunction<Point, Integer, double[]>() {
							@Override
							public Tuple2<Integer, double[]> call(Point p)
									throws Exception {
								return new Tuple2<Integer, double[]>(p
										.getType(), new double[] { p.getX1(),
										p.getX2(), p.getX3(), 1.0 });
							}
						});

				JavaPairRDD<Integer, double[]> perCluster = perPoint
						.reduceByKey(new Function2<double[], double[], double[]>() {
							@Override
							public double[] call(double[] a, double[] b)
									throws Exception {
								return new double[] { a[0] + b[0],
										a[1] + b[1], a[2] + b[2], a[3] + b[3] };
							}
						});

				// collect() gives no ordering guarantee and omits clusters
				// that received no points, so place each center by its key.
				Point[] newCenter = new Point[k];
				for (Tuple2<Integer, double[]> t : perCluster.collect()) {
					double[] acc = t._2;
					Point c = new Point();
					c.setType(t._1);
					c.setX1(acc[0] / acc[3]);
					c.setX2(acc[1] / acc[3]);
					c.setX3(acc[2] / acc[3]);
					newCenter[t._1] = c;
				}
				for (int j = 0; j < k; j++) {
					if (newCenter[j] == null) {
						// Empty cluster: keep its previous center.
						newCenter[j] = curCenterPoint[j];
					}
				}

				double ms = computerCenterDistance(curCenterPoint, newCenter,
						k);
				curCenterPoint = newCenter;
				if (ms < threshold) {
					break;
				}

				System.out.println("ite:=" + i);
			}

			System.out.println("classfy point====================");
			// Classify against the final centers; this also works when the
			// loop body never ran.
			List<Point> ls1 = data.map(
					new ComputerClass(curCenterPoint, k)).collect();
			for (Point p : ls1) {
				System.out.println(p.getType() + " " + p.getX1() + " "
						+ p.getX2() + " " + p.getX3());
			}

			System.out.println("center size:=" + curCenterPoint.length);
			for (int i = 0; i < k; i++) {
				Point c = curCenterPoint[i];
				System.out.println(c.getType() + " " + c.getX1() + " "
						+ c.getX2() + " " + c.getX3());
			}
		} finally {
			ctx.stop();
		}
	}

	/**
	 * Parses one input line of three whitespace-separated numbers into a
	 * {@link Point}; the cluster index is left at its default of 0.
	 *
	 * @throws NumberFormatException
	 *             if a coordinate is not a valid double
	 * @throws ArrayIndexOutOfBoundsException
	 *             if the line has fewer than three fields
	 */
	private static Point parsePoint(String line) {
		String[] strs = line.trim().split("\\s+");
		Point p = new Point();
		p.setX1(Double.parseDouble(strs[0]));
		p.setX2(Double.parseDouble(strs[1]));
		p.setX3(Double.parseDouble(strs[2]));
		return p;
	}

	/**
	 * Returns the squared Euclidean distance between two points (no square
	 * root is taken).
	 */
	static double computerPointDistance(Point p1, Point p2) {
		double d1 = p1.getX1() - p2.getX1();
		double d2 = p1.getX2() - p2.getX2();
		double d3 = p1.getX3() - p2.getX3();
		return d1 * d1 + d2 * d2 + d3 * d3;
	}

	/**
	 * Returns the sum, over the first {@code k} positions, of the squared
	 * distance between {@code oldCenter[i]} and {@code newCenter[i]}.
	 */
	static double computerCenterDistance(Point[] oldCenter, Point[] newCenter,
			int k) {
		double ret = 0;
		for (int i = 0; i < k; i++) {
			ret += computerPointDistance(newCenter[i], oldCenter[i]);
		}
		return ret;
	}

	/**
	 * Spark map function that parses a text line into a {@link Point} and
	 * stores in its type the index of the nearest of the given centers.
	 */
	static class ComputerClass extends Function<String, Point> {
		private Point[] centerPoints;
		private int k;

		/**
		 * @param points
		 *            current cluster centers
		 * @param k
		 *            number of centers in {@code points} to consider
		 */
		public ComputerClass(Point[] points, int k) {
			this.centerPoints = points;
			this.k = k;
		}

		@Override
		public Point call(String line) throws Exception {
			Point p = parsePoint(line);

			int nearest = 0;
			// Start from infinity so that arbitrarily distant points are
			// still compared against every center.
			double best = Double.POSITIVE_INFINITY;

			for (int j = 0; j < k; j++) {
				double dist = computerPointDistance(p, centerPoints[j]);
				if (dist < best) {
					best = dist;
					nearest = j;
				}
			}

			p.setType(nearest);
			return p;
		}
	}

	/**
	 * A 3-dimensional point plus the index of the cluster it is assigned to.
	 * Serializable so that Spark can ship instances between nodes.
	 */
	static class Point implements Serializable {
		private static final long serialVersionUID = 1L;
		private int type;
		private double x1;
		private double x2;
		private double x3;

		public int getType() {
			return type;
		}

		public void setType(int type) {
			this.type = type;
		}

		public double getX1() {
			return x1;
		}

		public void setX1(double x1) {
			this.x1 = x1;
		}

		public double getX2() {
			return x2;
		}

		public void setX2(double x2) {
			this.x2 = x2;
		}

		public double getX3() {
			return x3;
		}

		public void setX3(double x3) {
			this.x3 = x3;
		}
	}

}

The test data is as follows:

0.0 0.0 0.0
0.1 0.1 0.1
0.2 0.2 0.2
9.0 9.0 9.0
9.1 9.1 9.1
9.2 9.2 9.2
2 2 2
2.1 2.1 2.1
2.3 2.4 2.6


Example 2 (Scala): sorting records by key with Spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.serializer.KryoRegistrator
import com.esotericsoftware.kryo.Kryo
import org.apache.spark.rdd._
import org.apache.spark.SparkContext._
import org.apache.spark._
import org.apache.spark.api.java._

/** Kryo registrator hook for the demo; it currently registers no classes. */
class MyRegistrator extends KryoRegistrator {
  override def registerClasses(kryo: Kryo): Unit = ()
}

/** Demo job: sorts comma-separated records by their leading numeric field. */
object SortTest {

  /**
   * Reads the file named by `args(0)` from the hard-coded HDFS directory,
   * keys every line by its first comma-separated field parsed as a `Long`,
   * sorts in descending key order and prints the first 100 records followed
   * by the number of records printed.
   *
   * @param args `args(0)` is the input file name under `/test/`
   * @throws IllegalArgumentException if no file name is given
   */
  def main(args: Array[String]): Unit = {
    require(args.nonEmpty, "usage: SortTest <file name under hdfs://nameNodeIP:9000/test/>")

    val master = "spark://masterIP:7077"
    val input = "hdfs://nameNodeIP:9000/test/" + args(0)

    val conf = new SparkConf().setMaster(master).setAppName("sorttest")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    // Take the name from the class itself so it stays correct whatever
    // package this file is compiled into.
    conf.set("spark.kryo.registrator", classOf[MyRegistrator].getName)
    conf.set("spark.storage.memoryFraction", "0.5")
    // SPARK_HOME may be unset; skip the setting rather than pass null.
    Option(System.getenv("SPARK_HOME")).foreach(home => conf.setSparkHome(home))
    conf.setJars(SparkContext.jarOfClass(this.getClass()))

    val sc = new SparkContext(conf)
    try {
      val keyed: RDD[(Long, String)] =
        sc.textFile(input, 1000).map(line => (line.split(',')(0).toLong, line))
      val top: Array[(Long, String)] =
        keyed.sortByKey(ascending = false, numPartitions = 1000).take(100)

      top.zipWithIndex.foreach { case ((key, line), i) =>
        println(s"$i  $key  $line")
      }
      println(top.length)
    } finally {
      sc.stop()
    }
  }
}


 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值