Planning
cancer01 master/worker
cancer02 worker
cancer03 worker
cancer04 worker
cancer05 worker
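Every node must resolve the others by hostname; a minimal /etc/hosts sketch, appended on all five machines (the addresses are placeholders for illustration):
192.168.1.101 cancer01
192.168.1.102 cancer02
192.168.1.103 cancer03
192.168.1.104 cancer04
192.168.1.105 cancer05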
Preparation
su hadoop
Install Scala
On every machine (note: Spark 2.0.1 is prebuilt against Scala 2.11, so use a matching 2.11.x archive if you will compile jobs for this cluster):
cd /usr/local
tar zxf scala-2.13.4.tgz
mv scala-2.13.4 scala
chown -R hadoop:hadoop scala
vim /etc/profile
export SCALA_HOME=/usr/local/scala
export PATH=$PATH:$SCALA_HOME/bin
source /etc/profile
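A quick check that Scala is on the PATH (run after the source above):
scala -version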
Install Spark
wget http://d3kbcqa49mib13.cloudfront.net/spark-2.0.1-bin-hadoop2.7.tgz (if this mirror is offline, the release is also archived at https://archive.apache.org/dist/spark/spark-2.0.1/)
tar zxf spark-2.0.1-bin-hadoop2.7.tgz
mv spark-2.0.1-bin-hadoop2.7 /usr/local/spark
chown -R hadoop:hadoop /usr/local/spark
vim /etc/profile
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin
source /etc/profile
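A quick sanity check that the Spark binaries are on the PATH (run after the source above); spark-submit prints its version banner:
spark-submit --version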
Configure
cd /usr/local/spark/conf
mv spark-env.sh.template spark-env.sh
vim spark-env.sh
export JAVA_HOME=/usr/local/jdk1.8.0_271
export SCALA_HOME=/usr/local/scala
export HADOOP_HOME=/usr/local/hadoop
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_MASTER_IP=cancer01 # SPARK_MASTER_HOST is the preferred name in Spark 2.x
export SPARK_WORKER_MEMORY=1G # maximum total memory each worker can allocate to executors
export SPARK_WORKER_CORES=2 # number of CPU cores each worker may use
export SPARK_WORKER_INSTANCES=1 # number of worker instances started per machine
cp slaves.template slaves && vim slaves
cancer01
cancer02
cancer03
cancer04
cancer05
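start-slaves.sh logs in to every host in this file over SSH, so the hadoop user on cancer01 needs passwordless SSH to each of them; a sketch with the usual defaults:
ssh-keygen -t rsa
for h in cancer01 cancer02 cancer03 cancer04 cancer05; do ssh-copy-id hadoop@$h; done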
Copy
First create /usr/local/spark on cancer02-05 with hadoop ownership (scp cannot write into a root-owned /usr/local), then copy from cancer01:
cd /usr/local
scp -r spark hadoop@cancer02:/usr/local/
scp -r spark hadoop@cancer03:/usr/local/
scp -r spark hadoop@cancer04:/usr/local/
scp -r spark hadoop@cancer05:/usr/local/
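A quick check that the copy landed with the right ownership (cancer02 as an example):
ssh hadoop@cancer02 ls -ld /usr/local/spark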
Start
$HADOOP_HOME/sbin/start-all.sh
$SPARK_HOME/sbin/start-all.sh
or
$SPARK_HOME/sbin/start-master.sh
$SPARK_HOME/sbin/start-slaves.sh
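Check the daemons with jps on each node: cancer01 should show Master (and Worker, since it is also a worker here) plus NameNode and the other Hadoop daemons, and each remaining host in slaves should show Worker plus DataNode:
jps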
Verify
http://cancer01:9870 (Hadoop NameNode web UI)
http://cancer01:8080 (Spark master web UI)
http://cancer01:8081 (Spark worker web UI; each worker serves its own port 8081)
Run
$SPARK_HOME/bin/spark-shell
Verify (scala, in the spark-shell)
sc.textFile("wordcount.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect
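With HADOOP_CONF_DIR set, the relative path above resolves against the hadoop user's HDFS home directory, so upload a sample file there first, e.g.:
echo "hello spark hello hadoop" > wordcount.txt
hdfs dfs -put wordcount.txt /user/hadoop/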
Verify (java)
package com.xzrj.demo.sp;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.List;

public class SparkSqlDemoA {
    public static void main(String[] args) {
        // JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("SparkSqlDemoA").setMaster("local"));
        SparkSession spark = SparkSession.builder().appName("SparkSqlDemoA").master("local").getOrCreate(); // or spark://localhost:7077
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
        // read lines
        JavaRDD<String> lines = sc.textFile("hdfs://localhost:9000/in/person.txt");
        // parse each "ID NAME AGE" line into a Row
        JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
            public Row call(String line) throws Exception {
                String[] tmp = line.split(" ");
                return RowFactory.create(Integer.parseInt(tmp[0]), tmp[1].trim(), Integer.parseInt(tmp[2]));
            }
        });
        // schema fields
        List<StructField> sf = new ArrayList<StructField>();
        sf.add(DataTypes.createStructField("ID", DataTypes.IntegerType, true));
        sf.add(DataTypes.createStructField("NAME", DataTypes.StringType, true));
        sf.add(DataTypes.createStructField("AGE", DataTypes.IntegerType, true));
        // schema + row data -> DataFrame
        StructType st = DataTypes.createStructType(sf);
        Dataset<Row> df = spark.createDataFrame(rows, st);
        // register as a temp view
        df.createOrReplaceTempView("person");
        // query
        Dataset<Row> data = spark.sql("select * from person where name like 't%' ");
        data.show();
        // shut down
        sc.close();
    }
}
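The parser above expects space-separated "ID NAME AGE" lines; a hypothetical person.txt matching the path in the code (names invented for illustration; tom and tony will satisfy the like 't%' filter):
printf '1 tom 25\n2 jack 30\n3 tony 28\n' > person.txt
hdfs dfs -mkdir -p /in
hdfs dfs -put person.txt /in/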
Run
spark-submit --class com.xzrj.demo.sp.SparkSqlDemoA spdemo-1.0.jar
spark-submit --class com.xzrj.demo.sp.SparkSqlDemoA --master local --deploy-mode client --executor-memory 512M --total-executor-cores 1 ~/spdemo-1.0.jar
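To run on the standalone cluster instead of local mode, point --master at the master started above (a sketch; same class and jar as before). Note that the demo hardcodes .master("local") in the builder, which takes precedence over this flag, so drop that call before submitting to the cluster:
spark-submit --class com.xzrj.demo.sp.SparkSqlDemoA --master spark://cancer01:7077 --executor-memory 512M --total-executor-cores 2 ~/spdemo-1.0.jar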
Code walkthrough:
// functional style (Java 8 lambdas); needs scala.Tuple2 and java.util.Arrays
JavaRDD<String> textFile = sc.textFile("hdfs://...");
JavaPairRDD<String, Integer> counts = textFile
        .flatMap(s -> Arrays.asList(s.split(" ")).iterator())
        .mapToPair(word -> new Tuple2<>(word, 1))
        .reduceByKey((a, b) -> a + b);
counts.saveAsTextFile("hdfs://...");
// classic style (anonymous inner classes); needs org.apache.spark.api.java.function.*,
// scala.Tuple2, org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.FileSystem,
// org.apache.hadoop.fs.Path, java.net.URI, java.util.Arrays and java.util.Iterator
JavaRDD<String> lines = sc.textFile("hdfs://localhost:9000/in/wordcount.txt");
JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String s) throws Exception {
        return Arrays.asList(s.split(" ")).iterator();
    }
});
JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String s) throws Exception {
        return new Tuple2<String, Integer>(s, 1);
    }
});
JavaPairRDD<String, Integer> pairRDD = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
    @Override
    public Integer call(Integer i1, Integer i2) throws Exception {
        return i1 + i2;
    }
});
pairRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
    @Override
    public void call(Tuple2<String, Integer> s) throws Exception {
        System.out.println(s._1 + " " + s._2);
    }
});
// delete the output directory if it already exists, then save
FileSystem fs = FileSystem.get(new URI("hdfs://localhost:9000"), new Configuration(), "admin");
Path p = new Path("/out");
if (fs.exists(p)) fs.delete(p, true);
pairRDD.saveAsTextFile("hdfs://localhost:9000/out");
System.out.println("Word count succeeded!!");