Spark 3.0.1 installation

Cluster plan

cancer01 master/worker

cancer02 worker

cancer03 worker

cancer04 worker

cancer05 worker

 

Preparation

Work as the hadoop user; note that the chown and /etc/profile edits below additionally need root (or sudo).

su hadoop

 

Install Scala

On every machine (Spark 3.0.1 is built against Scala 2.12, so use a 2.12.x release rather than 2.13):

cd /usr/local

tar zxf scala-2.12.12.tgz

mv scala-2.12.12 scala

chown -R hadoop:hadoop scala

vim /etc/profile

export SCALA_HOME=/usr/local/scala

export PATH=$PATH:$SCALA_HOME/bin

source /etc/profile
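
A quick check that the Scala install and PATH are in place (optional):

scala -version        # should report a 2.12.x version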

 

Install Spark

wget https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz

tar zxf spark-3.0.1-bin-hadoop2.7.tgz

mv spark-3.0.1-bin-hadoop2.7 /usr/local/spark

chown -R hadoop:hadoop /usr/local/spark

vim /etc/profile

export SPARK_HOME=/usr/local/spark

export PATH=$PATH:$SPARK_HOME/bin

source /etc/profile
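
At this point spark-submit should already be on the PATH; a quick sanity check (optional):

spark-submit --version        # prints the Spark 3.0.1 version banner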

 

Configuration

cd /usr/local/spark/conf

mv spark-env.sh.template spark-env.sh

vim spark-env.sh

export JAVA_HOME=/usr/local/jdk1.8.0_271

export SCALA_HOME=/usr/local/scala

export HADOOP_HOME=/usr/local/hadoop

export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop

export SPARK_MASTER_HOST=cancer01      #hostname of the master (SPARK_MASTER_IP is the deprecated pre-2.0 name)

export SPARK_WORKER_MEMORY=1G        #maximum total memory each worker can allocate to executors

export SPARK_WORKER_CORES=2            #number of CPU cores each worker offers

export SPARK_WORKER_INSTANCES=1        #number of worker instances started per machine
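
For reference, with the five worker hosts planned above (cancer01 also runs a worker), these settings cap what the standalone cluster can hand out to executors at roughly:

5 workers x 1 instance x 2 cores = 10 cores

5 workers x 1 instance x 1G      = 5G of executor memory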

 

cp slaves.template slaves && vim slaves

cancer01

cancer02

cancer03

cancer04

cancer05

 

Copy to the other nodes

Create the /usr/local/spark directory on cancer02|03|04|05 (writable by the hadoop user), then from /usr/local on cancer01:

scp -r spark hadoop@cancer02:/usr/local/

scp -r spark hadoop@cancer03:/usr/local/

scp -r spark hadoop@cancer04:/usr/local/

scp -r spark hadoop@cancer05:/usr/local/
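
The /etc/profile entries for Spark were only added on cancer01; repeat them on cancer02-05 as well (the Scala steps were already done on every machine):

export SPARK_HOME=/usr/local/spark

export PATH=$PATH:$SPARK_HOME/bin

source /etc/profile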

 

Start

$HADOOP_HOME/sbin/start-all.sh

$SPARK_HOME/sbin/start-all.sh

or

$SPARK_HOME/sbin/start-master.sh

$SPARK_HOME/sbin/start-slaves.sh
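
If startup succeeded, jps should show the daemons on each host, roughly as follows (assuming the HDFS and YARN masters also run on cancer01, as the 9870 URL below suggests):

jps

# cancer01:   Master, Worker, NameNode, ResourceManager, ...

# cancer02-05: Worker, DataNode, NodeManager, ...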

 

Verify

http://cancer01:9870 (Hadoop NameNode web UI)

http://cancer01:8080 (Spark master web UI)

http://cancer01:8081 (Spark worker web UI)

 

Run

./bin/spark-shell
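
Started this way the shell uses a local master; to attach it to the standalone cluster instead, pass the master URL shown on the :8080 page (spark://cancer01:7077 by default):

./bin/spark-shell --master spark://cancer01:7077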

 

Verify (Scala, in spark-shell)

sc.textFile("wordcount.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect

Web UI while the shell is running: http://localhost:4040
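
When the shell is attached to the cluster, the input file must be readable by every executor, so it is safer to read from HDFS; the same word count against HDFS, assuming the NameNode listens on cancer01:9000 (the Java example below uses localhost:9000):

sc.textFile("hdfs://cancer01:9000/in/wordcount.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect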

 

Verify (Java)

package com.xzrj.demo.sp;

 

import org.apache.spark.api.java.JavaRDD;

import org.apache.spark.api.java.JavaSparkContext;

import org.apache.spark.api.java.function.Function;

import org.apache.spark.sql.Dataset;

import org.apache.spark.sql.Row;

import org.apache.spark.sql.RowFactory;

import org.apache.spark.sql.SparkSession;

import org.apache.spark.sql.types.DataTypes;

import org.apache.spark.sql.types.StructField;

import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;

import java.util.List;

 

public class SparkSqlDemoA {

 

    public static void main(String[] args) {

 

//        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("SparkSqlDemoA").setMaster("local"));

 

        SparkSession spark = SparkSession.builder().appName("SparkSqlDemoA").master("local").getOrCreate();   // spark://localhost:7077

        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        // Read input lines from HDFS

        JavaRDD<String> lines = sc.textFile("hdfs://localhost:9000/in/person.txt");

        // Turn each line into a Row (id, name, age)

        JavaRDD<Row> rows = lines.map(new Function<String, Row>() {

            public Row call(String line) throws Exception {

                String[] tmp = line.split(" ");

                return RowFactory.create(Integer.parseInt(tmp[0]), tmp[1].trim(), Integer.parseInt(tmp[2]));

            }

        });

 

        // Column definitions

        List<StructField> sf = new ArrayList<StructField>();

        sf.add(DataTypes.createStructField("ID", DataTypes.IntegerType, true));

        sf.add(DataTypes.createStructField("NAME", DataTypes.StringType, true));

        sf.add(DataTypes.createStructField("AGE", DataTypes.IntegerType, true));

 

        // Build the schema

        StructType st = DataTypes.createStructType(sf);

        // Schema + row data -> DataFrame

        Dataset<Row> df = spark.createDataFrame(rows, st);

        // Register the DataFrame as a temporary view

        df.createOrReplaceTempView("person");

 

        // Query

        Dataset<Row> data = spark.sql("select * from person where name like 't%' ");

        data.show();

 

        // Shut down

        sc.close();

    }

}
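
To build spdemo-1.0.jar with Maven, the Spark libraries have to be on the compile classpath; a minimal sketch of the dependencies, assuming a standard Maven project (Scala 2.12 builds of Spark 3.0.1, scope provided because the cluster supplies the jars at runtime):

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.12</artifactId>
    <version>3.0.1</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.12</artifactId>
    <version>3.0.1</version>
    <scope>provided</scope>
</dependency>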

Run

spark-submit --class com.xzrj.demo.sp.SparkSqlDemoA spdemo-1.0.jar

spark-submit --class com.xzrj.demo.sp.SparkSqlDemoA --master local --deploy-mode client --executor-memory 512M --total-executor-cores 1 ~/spdemo-1.0.jar
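
To run the same job on the standalone cluster instead of locally, point --master at the cluster (and remove or change the hard-coded .master("local") in the SparkSession builder first, since a master set in code takes precedence over the spark-submit flag); a sketch, assuming the jar sits in the hadoop user's home directory:

spark-submit --class com.xzrj.demo.sp.SparkSqlDemoA --master spark://cancer01:7077 --executor-memory 512M --total-executor-cores 1 ~/spdemo-1.0.jar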

 

 

Code walkthrough:

// Lambda (functional) style (needs import java.util.Arrays; and import scala.Tuple2;)

JavaRDD<String> textFile = sc.textFile("hdfs://...");

JavaPairRDD<String, Integer> counts = textFile
        .flatMap(s -> Arrays.asList(s.split(" ")).iterator())
        .mapToPair(word -> new Tuple2<>(word, 1))
        .reduceByKey((a, b) -> a + b);

counts.saveAsTextFile("hdfs://...");

 

// Anonymous-class (pre-lambda) style; also needs org.apache.spark.api.java.function.*, scala.Tuple2, java.util.Arrays, java.util.Iterator and the Hadoop FileSystem/Path/Configuration imports

JavaRDD<String> lines = sc.textFile("hdfs://localhost:9000/in/wordcount.txt");

JavaRDD<String> words = lines.flatMap(

    new FlatMapFunction<String, String>() {

        @Override

        public Iterator<String> call(String s) throws Exception {

            return Arrays.asList(s.split(" ")).iterator();

        }

    }

);

JavaPairRDD<String, Integer> pairs = words.mapToPair(

    new PairFunction<String, String, Integer>() {

        @Override

        public Tuple2<String, Integer> call(String s) throws Exception {

            return new Tuple2<String, Integer>(s, 1);

        }

    }

);

JavaPairRDD<String, Integer> pairRDD = pairs.reduceByKey(

    new Function2<Integer, Integer, Integer>() {

        @Override

        public Integer call(Integer i1, Integer i2) throws Exception {

            return i1 + i2;

        }

    }

);

pairRDD.foreach(

    new VoidFunction<Tuple2<String, Integer>>() {

        @Override

        public void call(Tuple2<String, Integer> s) throws Exception {

            System.out.println(s._1 + "  " + s._2);

        }

    }

);

// Delete the HDFS output path if it already exists, otherwise saveAsTextFile fails

FileSystem fs = FileSystem.get(new URI("hdfs://localhost:9000"), new Configuration(), "admin");

Path p = new Path("/out");

if(fs.exists(p))

    fs.delete(p, true);

 

pairRDD.saveAsTextFile("hdfs://localhost:9000/out");

System.out.println("Word count finished!!");

 
